# Quick LoPA Inference (HF model)

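This notebook downloads the HF repo `jeongseokoh/Mistral-7B-Instruct-v0.2-LOPA-partial4-0specials`, builds the model from it (merging the LoRA adapter into the base Mistral backbone when one is present), and runs inference with the `lopa_generate` function from the local `infer_lopa_pure.py` script.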

In [None]:
# Optional: install dependencies if needed (uncomment)
# %pip install -q transformers accelerate peft

# Switch the kernel's working directory so that the later
# `from infer_lopa_pure import lopa_generate` import and the
# `python infer_lopa_pure.py` CLI example resolve against this directory.
# NOTE(review): machine-specific absolute path (presumably the checkout that
# contains infer_lopa_pure.py) — adjust it for your environment before running.
%cd /data2/jeongseokoh/jeongseokoh/LatentCOMP_cleaned

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download
import os, torch, pathlib

# Import LoPA generator from the local script
from infer_lopa_pure import lopa_generate

# HF repo that holds best/ (tokenizer at root, base/ and lora/ as subfolders)
repo_id = 'jeongseokoh/Mistral-7B-Instruct-v0.2-LOPA-partial4-0specials'
# Base backbone to merge LoRA into
base_model_id = 'mistralai/Mistral-7B-Instruct-v0.2'

# Prefer the GPU when available. Precision: bf16 if the GPU supports it,
# fp16 on other GPUs, fp32 on CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = (torch.bfloat16 if (device=='cuda' and torch.cuda.is_bf16_supported()) else
         (torch.float16 if device=='cuda' else torch.float32))

# Download the HF repo snapshot locally so we can access subfolders
best_dir = snapshot_download(repo_id=repo_id)
print('Snapshot path:', best_dir)

# Load tokenizer from the snapshot root (to pick up chat template)
tokenizer = AutoTokenizer.from_pretrained(best_dir, use_fast=True)
if tokenizer.pad_token is None:
    # No dedicated pad token: reuse EOS so padding is possible.
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# Decide where the weights come from BEFORE loading anything, so the 7B model
# is instantiated exactly once:
#   * LoRA adapter present      -> Hub base backbone, then merge the adapter
#   * no adapter, base/ present -> weights stored inside the snapshot
#   * neither                   -> plain Hub base backbone
lora_dir = os.path.join(best_dir, 'lora')
base_dir = os.path.join(best_dir, 'base')
has_lora = os.path.isdir(lora_dir) and any(pathlib.Path(lora_dir).glob('adapter*'))
use_snapshot_base = (not has_lora) and os.path.isdir(base_dir)
weights_source = base_dir if use_snapshot_base else base_model_id

model = AutoModelForCausalLM.from_pretrained(weights_source, trust_remote_code=False, torch_dtype=dtype)
if has_lora:
    # peft is only needed when an adapter exists, so import it lazily.
    from peft import PeftModel
    peft_model = PeftModel.from_pretrained(model, lora_dir)
    # Fold the adapter weights into the backbone and drop the PEFT wrapper.
    model = peft_model.merge_and_unload()

model = model.to(device).eval()

# Best-effort: request eager attention on the top-level config and, if it is a
# distinct object, on the inner model's config.
# NOTE(review): presumably required by lopa_generate — confirm.
# Each setattr is guarded on its own, so one failure neither aborts the cell
# nor skips the remaining attributes; failures are reported instead of hidden.
attn_configs = [model.config]
inner_config = getattr(getattr(model, 'model', None), 'config', None)
if inner_config is not None and inner_config is not model.config:
    attn_configs.append(inner_config)
for cfg in attn_configs:
    for k in ('attn_implementation', '_attn_implementation'):
        try:
            setattr(cfg, k, 'eager')
        except Exception as exc:
            print(f'Warning: could not set {k} on {type(cfg).__name__}: {exc}')
print('Loaded tokenizer and model. Device:', device, '| dtype:', dtype)

In [None]:
# Example inputs
system = 'You are a helpful assistant that answers questions based on the given document. '
document = 'The Eiffel Tower is a wrought-iron lattice tower in Paris, France. It is named after the engineer Gustave Eiffel.'
question = 'Which city is the Eiffel Tower located in?'

# LoPA prefill layers (K). Use 4 to match the partial4 model.
K = 4

# Generation below samples (do_sample=True). Seed torch's global RNG so that
# Restart & Run All is repeatable, as far as lopa_generate draws its samples
# from that RNG. Change SEED to explore other samples.
SEED = 0
torch.manual_seed(SEED)

# inference_mode disables autograd tracking for the whole generation call.
with torch.inference_mode():
    answer = lopa_generate(
        model, tokenizer,
        system=system, document=document, question=question,
        K=K, device=device,
        max_new_tokens=128, min_length=8,
        temperature=0.7, top_p=0.95, top_k=None,
        do_sample=True, debug=False,
    )
print('Answer:', answer)

---
### Alternative: CLI (inside notebook)
Download the repo with `snapshot_download`, pass the resulting local path as `--best_dir`, and set `--model_name` to the original base backbone (`mistralai/Mistral-7B-Instruct-v0.2`).

In [None]:
# Example CLI usage (uncomment to run)
# Runs the same script from the shell instead of calling lopa_generate in-process.
# `{best_dir}` is interpolated by IPython from the Python variable of that name
# before the shell command runs, so the snapshot_download line must run first.
# NOTE(review): `--prefill_layers 4` presumably corresponds to K=4 in the Python
# example — confirm against infer_lopa_pure.py's argument parser.
# from huggingface_hub import snapshot_download
# best_dir = snapshot_download(repo_id='jeongseokoh/Mistral-7B-Instruct-v0.2-LOPA-partial4-0specials')
# !python infer_lopa_pure.py \
#   --best_dir {best_dir} \
#   --model_name mistralai/Mistral-7B-Instruct-v0.2 \
#   --prefill_layers 4 \
#   --document 'The Eiffel Tower is a wrought-iron lattice tower in Paris, France.' \
#   --question 'Which city is the Eiffel Tower located in?' \
#   --max_new_tokens 128 --do_sample --top_p 0.95 --temperature 0.7