In [1]:
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.4.3-py3-none-any.whl.metadata (46 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 46.4/46.4 kB 852.7 kB/s eta 0:00:00
Collecting unsloth_zoo>=2025.4.2 (from unsloth)
  Downloading unsloth_zoo-2025.4.2-py3-none-any.whl.metadata (8.0 kB)
Collecting torch>=2.4.0 (from unsloth)
  Downloading torch-2.7.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.30-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.3.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.5 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.19-py3-none-any.whl.metadata (9.9 kB)
Collecting transformers!=4.47.0,>=4.

In [2]:
!pip install datasets

[notice] A new release of pip is available: 23.3.1 -> 25.1
[notice] To update, run: python -m pip install --upgrade pip


In [3]:
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
from tqdm import tqdm
import json

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
# Load your LoRA fine-tuned model together with its tokenizer.
# The loading options are gathered in one place so they are easy to tweak.
model_load_options = dict(
    model_name="./checkpoint-100",  # Replace with your local fine-tuned checkpoint path
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**model_load_options)

==((====))==  Unsloth 2025.4.3: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.096 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 9.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Unsloth 2025.4.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [5]:
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd

In [6]:
# 1. Load TruthfulQA ("generation" configuration) and keep its validation split
truthfulqa_splits = load_dataset("truthfulqa/truthful_qa", "generation")
truthfulqa = truthfulqa_splits["validation"]

README.md:   0%|          | 0.00/9.59k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/223k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/817 [00:00<?, ? examples/s]

In [7]:
# 2. Convert to Open-Instruct format (only using best answer)
# Each record maps the question to "instruction", leaves "input" empty and
# stores the reference best answer under "output".
formatted_eval_data = [
    {
        "instruction": record["question"],
        "input": "",
        "output": record["best_answer"],
    }
    for record in tqdm(truthfulqa, desc="Formatting TruthfulQA")
]

Formatting TruthfulQA: 100%|██████████| 817/817 [00:00<00:00, 20146.19it/s]


In [8]:
# 3. Convert to a DataFrame and persist it, one JSON record per line
df_eval = pd.DataFrame(data=formatted_eval_data)

json_export_options = {"orient": "records", "lines": True}
df_eval.to_json("truthfulqa_openinstruct_eval.json", **json_export_options)

print("✅ Saved formatted evaluation dataset to 'truthfulqa_openinstruct_eval.json'")

✅ Saved formatted evaluation dataset to 'truthfulqa_openinstruct_eval.json'


In [9]:
# Load the evaluation dataset: the file holds one JSON object per line,
# so every line is parsed independently.
with open("truthfulqa_openinstruct_eval.json", "r") as jsonl_file:
    eval_data = list(map(json.loads, jsonl_file))
 

In [11]:
from torch.utils.data import DataLoader

In [14]:
import torch 
from tqdm import tqdm 

# Prepare prompts: one "### Instruction / ### Response" template per evaluation sample
prompts = [f"### Instruction:\n{sample['instruction']}\n\n### Response:\n" for sample in eval_data]

# Set batch size based on your memory budget (H100 can usually handle 16–64+)
BATCH_SIZE = 16
# shuffle keeps its default (False), so batch k holds prompts [k*BATCH_SIZE, (k+1)*BATCH_SIZE)
dataloader = DataLoader(prompts, batch_size=BATCH_SIZE)

model_outputs = []

# Decoder-only models continue from the LAST position of each row. With right
# padding the shorter prompts of a batch would be continued from pad tokens,
# silently degrading their answers, so force left padding while generating and
# restore the previous setting afterwards (it may be needed for training).
previous_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"

try:
    with torch.inference_mode():
        for batch_index, batch_prompts in enumerate(tqdm(dataloader, desc="Running inference in batches")):
            # Slice the matching samples once instead of recomputing a flat index per row.
            batch_start = batch_index * BATCH_SIZE
            batch_samples = eval_data[batch_start:batch_start + len(batch_prompts)]

            inputs = tokenizer(
                batch_prompts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=1024  # adjust if needed
            ).to(model.device)  # follow the model instead of hard-coding "cuda"

            outputs = model.generate(
                **inputs,
                max_new_tokens=150,
                do_sample=False,
                use_cache=True
            )

            # generate() returns prompt + continuation. Because every row is left-padded
            # to the same length, dropping the first `prompt_length` tokens leaves exactly
            # the newly generated tokens. This avoids splitting the decoded text on
            # "### Response:", which mis-parses when the question or the generation
            # itself contains that marker.
            prompt_length = inputs["input_ids"].shape[1]
            generated_tokens = outputs[:, prompt_length:]
            decoded = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

            for sample, generated_text in zip(batch_samples, decoded):
                model_outputs.append({
                    "instruction": sample['instruction'],
                    "best_answer": sample['output'],
                    "model_output": generated_text.strip()
                })
finally:
    tokenizer.padding_side = previous_padding_side

# Every evaluation sample must have exactly one recorded response.
assert len(model_outputs) == len(eval_data), "number of responses does not match number of samples"

# Save model outputs
with open("FINAL_MODEL_RESPONSES.json", "w") as f:
    json.dump(model_outputs, f, indent=2)

print("✅ Inference completed and saved.")

Running inference in batches: 100%|██████████| 52/52 [05:25<00:00,  6.26s/it]

✅ Inference completed and saved.



