# Week 7: Synthetic Data Generation & QLoRA Fine-Tuning (Academic Q&A)
**Pipeline:** abstracts → GPT-4 Q&A → JSONL → QLoRA (Unsloth) → Evaluation

## 0. Install requirements

In [None]:
# If on Colab, use the provided requirements file:
# from google.colab import files
# files.upload()  # upload requirements.txt if needed
# !pip install -r requirements.txt
# Or install manually:
# !pip install unsloth trl transformers peft bitsandbytes datasets accelerate pandas


## 1. Config & Imports

In [None]:
import os, json
from datasets import load_dataset
from pathlib import Path

# System prompt embedded in every training example (to_jsonl_from_aggregated)
# and in every evaluation prompt (answer_question); keep the two in sync by
# always going through this constant.
SYSTEM_PROMPT = "You are a helpful academic Q&A assistant specialized in scholarly content."

# Working directory for generated data files.
DATA_DIR = "./data"  # change if needed

# Instruction-tuning file: one {"text": ...} JSON object per line.
JSONL_PATH = f"{DATA_DIR}/synthetic_qa.jsonl"

# Pre-quantized 4-bit base checkpoint from the Unsloth Hugging Face org.
# Llama 3.1 is published in 8B / 70B / 405B sizes only, so the previous
# "llama-3.1-7b" repo id could not be resolved; the 8B checkpoint is used.
MODEL_NAME = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"

# Where the fine-tuned adapter is saved and later reloaded from. The "7b" in
# the folder name is only a label and is kept so existing outputs still load.
OUTPUT_DIR = "./llama3-7b-qlora-finetuned"

# Make sure the data directory exists before any cell writes into it.
os.makedirs(DATA_DIR, exist_ok=True)
print("DATA_DIR:", DATA_DIR)


## 2. (Optional) Generate Q&A with GPT-4

In [None]:
# This cell does not call any API; it only documents the expected shape of the
# aggregated Q&A file that section 3 converts to JSONL.
#
# Expected structure (a list with one entry per paper):
# papers_qas = [
#   {
#     "paper_id": "arxiv:2401.12345",
#     "title": "Title...",
#     "qas": [
#       {"question": "What is the main contribution?", "answer": "The paper proposes ..."},
#       # total ~5 per paper
#     ]
#   },
#   # ... ~100 papers
# ]
#
# Once `papers_qas` is populated, persist it so the conversion step can read it:
# with open(f"{DATA_DIR}/papers_qas.json", "w", encoding="utf-8") as f:
#     json.dump(papers_qas, f, ensure_ascii=False, indent=2)
print("Skip if you already created Q&A JSON.")


## 3. Convert Q&A to instruction-tuning JSONL

In [None]:
import json

def to_jsonl_from_aggregated(input_path, jsonl_path, system_prompt=None):
    """Convert an aggregated Q&A JSON file into instruction-tuning JSONL.

    The input is a JSON list of paper entries, each optionally carrying a
    ``"qas"`` list of ``{"question": ..., "answer": ...}`` objects. Every
    usable pair becomes one output line of the form
    ``{"text": "<|system|>…<|user|>…<|assistant|>…"}``.

    Pairs that are not objects, lack a string question or answer, or are
    blank after stripping are skipped instead of crashing the conversion or
    producing an empty training example.

    Args:
        input_path: Path of the aggregated JSON file to read.
        jsonl_path: Path of the JSONL file to write (overwritten if present).
            Its parent directory is created when missing.
        system_prompt: System prompt to embed in each example. Defaults to
            the module-level ``SYSTEM_PROMPT``.

    Returns:
        The number of Q&A pairs written.
    """
    if system_prompt is None:
        system_prompt = SYSTEM_PROMPT

    with open(input_path, "r", encoding="utf-8") as f:
        items = json.load(f)

    # Opening the output for writing fails if its directory does not exist.
    parent = os.path.dirname(jsonl_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    n = 0
    skipped = 0
    with open(jsonl_path, "w", encoding="utf-8") as w:
        for it in items:
            # `or []` also covers an explicit `"qas": null` entry.
            for qa in it.get("qas") or []:
                q = qa.get("question") if isinstance(qa, dict) else None
                a = qa.get("answer") if isinstance(qa, dict) else None
                if not isinstance(q, str) or not isinstance(a, str):
                    skipped += 1
                    continue
                q = q.strip()
                a = a.strip()
                if not q or not a:
                    skipped += 1
                    continue
                full_prompt = f"<|system|>{system_prompt}<|user|>{q}<|assistant|>{a}"
                w.write(json.dumps({"text": full_prompt}, ensure_ascii=False) + "\n")
                n += 1

    print(f"JSONL written: {jsonl_path} with {n} pairs")
    if skipped:
        print(f"Skipped {skipped} malformed or empty pairs")
    return n

# Example (uncomment when you have papers_qas.json):
# to_jsonl_from_aggregated(f"{DATA_DIR}/papers_qas.json", JSONL_PATH)


## 4. QLoRA Fine-Tuning (Unsloth)

In [None]:
# Import unsloth first: it patches transformers/trl on import, and Unsloth
# recommends importing it before them.
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import SFTTrainer  # SFTTrainer is provided by trl, not by unsloth
from transformers import AutoTokenizer, TrainingArguments

# Maximum context length the model is prepared for.
MAX_SEQ_LENGTH = 2048

# Load model & tokenizer. from_pretrained returns a (model, tokenizer) pair,
# so both must be unpacked; the returned tokenizer matches the checkpoint.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=True,
)

# Attach LoRA adapters. A 4-bit quantized base has no trainable weights of its
# own, so without adapters there is nothing for the trainer to optimize.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

# Load dataset
dataset = load_dataset("json", data_files=JSONL_PATH, split="train")


def _append_eos(example):
    """Terminate a training text with EOS so the model learns where answers end."""
    text = example["text"]
    eos = tokenizer.eos_token
    # Guard against double-appending if the text is already terminated.
    if eos and not text.endswith(eos):
        text = text + eos
    return {"text": text}


dataset = dataset.map(_append_eos)

# Pick the mixed-precision mode the GPU supports instead of forcing fp16.
use_bf16 = is_bfloat16_supported()

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    args=TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        num_train_epochs=2,
        learning_rate=2e-4,
        fp16=not use_bf16,
        bf16=use_bf16,
        logging_steps=50,
        save_strategy="epoch",
    ),
)
trainer.train()

# Save the adapter together with the tokenizer so OUTPUT_DIR is self-contained.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Saved to", OUTPUT_DIR)


## 5. Evaluation (Base vs Fine-tuned)

In [None]:
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

# Fixed question set asked of both the base and the fine-tuned model so their
# answers can be compared side by side.
# NOTE(review): the first question contains a literal "[topic]" placeholder —
# it looks like it should be replaced with a concrete topic before running.
TEST_QUESTIONS = [
    "Summarize the main hypothesis of the paper on [topic].",
    "How did the authors evaluate their model? What metrics were used?",
    "What are the key limitations discussed by the authors?",
    "Which baselines were compared and how did the proposed method perform?",
    "Describe the core algorithmic novelty introduced in the study.",
    "What datasets were used and why are they appropriate?",
    "How does the method generalize across tasks or domains?",
    "What ablation studies were conducted and what did they show?",
    "What future work do the authors suggest?",
    "Explain the difference between this method and a closely related approach."
]

def answer_question(model, tokenizer, q, max_new_tokens=200):
    """Generate a greedy answer to question ``q`` and return only the answer text.

    The prompt uses the same ``<|system|>…<|user|>…<|assistant|>`` layout as the
    training data, with the module-level ``SYSTEM_PROMPT`` as the system turn.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        q: The question to ask.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation, with the prompt and special tokens
        removed and surrounding whitespace stripped.
    """
    prompt = f"<|system|>{SYSTEM_PROMPT}<|user|>{q}<|assistant|>"

    # Inputs must live on the same device as the model weights.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # generate() returns prompt + continuation for decoder-only models. Slicing
    # by prompt length isolates the answer without splitting on a marker string
    # that the question or the answer might also contain.
    prompt_len = inputs["input_ids"].shape[1]
    answer_ids = output_ids[0][prompt_len:]
    return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

# Load base and ft models. from_pretrained returns (model, tokenizer); the
# tokenizer returned with the base checkpoint is reused for both models so
# they receive identical token ids.
base_model, tok = FastLanguageModel.from_pretrained(model_name=MODEL_NAME, load_in_4bit=True)
ft_model, _ = FastLanguageModel.from_pretrained(model_name=OUTPUT_DIR, load_in_4bit=True)

# Switch both models to Unsloth's inference mode before calling generate().
FastLanguageModel.for_inference(base_model)
FastLanguageModel.for_inference(ft_model)

# Ask every question of both models and collect the raw answers.
rows = []
for i, q in enumerate(TEST_QUESTIONS, 1):
    base_ans = answer_question(base_model, tok, q)
    ft_ans = answer_question(ft_model, tok, q)
    print(f"Q{i}: {q}")
    # Console output is truncated for readability; the CSV keeps full answers.
    print("Base:", base_ans[:800])
    print("FT  :", ft_ans[:800])
    print("-"*80)
    rows.append({"id": i, "question": q, "base_answer": base_ans, "ft_answer": ft_ans})

# Optionally write to CSV for manual scoring
import pandas as pd
pd.DataFrame(rows).to_csv("eval_raw_outputs.csv", index=False)
print("Saved eval_raw_outputs.csv")
