<a href="https://colab.research.google.com/github/kdhenderson/msds_colab_notebooks/blob/main/MSDS_Workshop_Fine_Tuning_Part_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ 1. Install required packages
# Notebook shell commands (leading "!"); -q keeps pip output quiet.
# First command: Unsloth and the libraries used with it below (peft, trl, bitsandbytes, ...);
# xformers is pinned to exactly 0.0.29.post3.
!pip install -q unsloth bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
# Second command: tokenizer / dataset / Hub helpers; datasets must be at least 3.4.1.
!pip install -q sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer


In [None]:
# ✅ 2. Mount Google Drive
# The train/test JSONL paths used later live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


In [None]:

# ✅ 3. Import and load model
from unsloth import FastLanguageModel

# Sequence-length budget; the same value is passed to the trainer later on.
max_seq_length = 2048

# Settings for loading the Llama 3.2 1B Instruct checkpoint with 4-bit weights.
# dtype is left as None rather than being forced to a specific precision.
_model_load_kwargs = dict(
    model_name="unsloth/Llama-3.2-1B-Instruct",
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**_model_load_kwargs)

# ✅ 4. Apply LoRA
# Layers that receive adapters: the four attention projections and the three MLP projections.
_lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# NOTE(review): lora_alpha (2) is smaller than the rank r (8); many LoRA recipes
# use alpha >= r. Confirm this low scaling is intentional for the workshop.
model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    target_modules=_lora_target_modules,
    lora_alpha=2,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

# ✅ 5. Load dataset
from datasets import load_dataset

train_path = "/content/drive/MyDrive/ds_6371_qna_500.jsonl"
test_path = "/content/drive/MyDrive/ds_6371_test_qna_whamo.jsonl"


def _load_jsonl_split(path, split_name):
    """Load one JSON-lines file and return it as the split called *split_name*."""
    return load_dataset("json", data_files={split_name: path}, split=split_name)


train_ds = _load_jsonl_split(train_path, "train")
test_ds = _load_jsonl_split(test_path, "test")

# ✅ 6. Format dataset

from unsloth.chat_templates import get_chat_template, standardize_sharegpt

# Normalise the ShareGPT-style conversations into the role/content message
# format that the rest of the notebook reads (train split first, then test).
train_ds, test_ds = (standardize_sharegpt(split) for split in (train_ds, test_ds))

# Attach the llama-3.1 chat template to the tokenizer
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

def format_prompt(example):
    """Render one example's conversation into a single chat-template string.

    Args:
        example: Dataset row holding a "conversations" list of chat messages.

    Returns:
        A dict with a single "text" key containing the rendered, untokenized
        conversation.
    """
    rendered = tokenizer.apply_chat_template(
        example["conversations"],
        tokenize=False,
        # The whole conversation is rendered; no open assistant header is appended.
        add_generation_prompt=False,
    )
    return {"text": rendered}

# Add the rendered "text" column to both splits (train first, then test).
train_ds, test_ds = (split.map(format_prompt) for split in (train_ds, test_ds))

# ✅ 7. Tokenize dataset
# The llama-3.1 chat template was already attached to `tokenizer` in step 6,
# so it is not imported and applied a second time here.

def tokenize(example):
    """Tokenize the rendered "text" of one example.

    The chat template has already written its special tokens (including the
    begin-of-text token) into the string, so the tokenizer is told not to add
    its own; otherwise each training sequence would start with a duplicated
    BOS token. Truncation is tied explicitly to ``max_seq_length`` so it
    matches the length given to the trainer.

    Args:
        example: Dataset row with a "text" string produced by ``format_prompt``.

    Returns:
        The tokenizer output for that string (token ids and attention mask).
    """
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=max_seq_length,
        add_special_tokens=False,
    )

# Replace every existing column with the tokenized fields.
train_ds = train_ds.map(tokenize, remove_columns=train_ds.column_names, num_proc=2)
# NOTE(review): test_inputs is not referenced by the later cells visible here;
# it is kept so any external use keeps working.
test_inputs = tokenizer(
    test_ds["text"],
    return_tensors="pt",
    padding=True,
    truncation=True,
    add_special_tokens=False,
).to("cuda")

# ✅ 8. Train model using TRL
from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorForSeq2Seq

# Optimisation settings: 2 sequences per device x 4 accumulation steps
# = 8 sequences per optimizer step on each device.
training_args = SFTConfig(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=100,  # You can increase this for better results
    learning_rate=2e-4,
    logging_steps=5,
    optim="adamw_8bit",
    output_dir="outputs",
    report_to="none",
)

# NOTE(review): train_ds is already tokenized at this point, and
# dataset_text_field names the "input_ids" column rather than a text column —
# confirm this is what the installed TRL/Unsloth version expects.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    dataset_text_field="input_ids",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer),
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

# ✅ 9. Mask user input so loss is only calculated on assistant output
from unsloth.chat_templates import train_on_responses_only

# Header strings that open a user turn and an assistant turn in the rendered text.
_user_header = "<|start_header_id|>user<|end_header_id|>\n\n"
_assistant_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"

trainer = train_on_responses_only(
    trainer,
    instruction_part=_user_header,
    response_part=_assistant_header,
)

# Run the fine-tuning loop configured in step 8.
trainer.train()


# ✅ 10. Run inference on test set
FastLanguageModel.for_inference(model)


def _prompt_turns(conversation):
    """Return the turns that come before the first assistant reply.

    These are the only turns the model may see at inference time; feeding the
    rendered "text" column instead would put the reference answer in the prompt.
    """
    prompt = []
    for turn in conversation:
        if turn["role"] == "assistant":
            break
        prompt.append(turn)
    return prompt


for i, conversation in enumerate(test_ds["conversations"][:5]):
    print(f"--- Test Example {i+1} ---")
    messages = _prompt_turns(conversation)
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")
    # temperature only has an effect when sampling is switched on.
    output = model.generate(
        input_ids=input_ids,
        max_new_tokens=64,
        do_sample=True,
        temperature=0.7,
    )
    # The decoded sequence contains the prompt followed by the generated answer.
    print(tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
    print()


## Evaluate

In [None]:
# ✅ Install required packages
# Notebook shell command (leading "!"); installs the metric libraries that the
# imports right below rely on: bert_score, sentence_transformers and evaluate.
!pip install -q bert-score sentence-transformers evaluate

# ✅ Imports
import evaluate
from bert_score import score as bert_score
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

# ✅ Load evaluation metrics
exact_match, bleu = (
    evaluate.load(metric_name) for metric_name in ("exact_match", "bleu")
)

# ✅ Load SentenceTransformer model for cosine similarity
_EMBEDDER_NAME = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(_EMBEDDER_NAME)

# ✅ Define predictions and ground truths from your test set
# preds[i] and truths[i] always refer to the same test example.
preds = []
truths = []

for example in test_ds:
    conversation = example["conversations"]

    # The reference answer is the first assistant turn of the conversation.
    answer_idx = next(
        (idx for idx, turn in enumerate(conversation) if turn["role"] == "assistant"),
        None,
    )
    if answer_idx is None:
        # No reference answer: skip the example so preds and truths stay aligned.
        continue

    # Build the prompt from the turns BEFORE the reference answer only, so the
    # model never sees the ground truth it is being scored against.
    messages = conversation[:answer_idx]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")

    # Generate prediction
    output = model.generate(input_ids=input_ids, max_new_tokens=128)

    # generate() returns the prompt tokens followed by the continuation;
    # slice off the prompt so the prediction is the assistant response alone.
    new_tokens = output[0][input_ids.shape[-1]:]
    prediction = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    preds.append(prediction)
    truths.append(conversation[answer_idx]["content"].strip())



# ✅ Compute Exact Match
em_result = exact_match.compute(predictions=preds, references=truths)

# ✅ Compute BLEU (over .2 is thought to be okay)
# Each truth is wrapped in its own single-item list of references.
_bleu_references = [[truth] for truth in truths]
bleu_result = bleu.compute(predictions=preds, references=_bleu_references)

# ✅ Compute BERTScore
# P, R and F1 hold one score per (prediction, truth) pair.
P, R, F1 = bert_score(
    preds,
    truths,
    model_type="microsoft/deberta-xlarge-mnli",  # avoids Unsloth conflict
    lang="en",
    verbose=True,
)

# Collapse the per-pair scores into a single mean for each measure.
bertscore_result = {
    key: scores.mean().item()
    for key, scores in zip(("precision", "recall", "f1"), (P, R, F1))
}


# ✅ Compute cosine similarity using sentence embeddings
# Only complete (prediction, truth) pairs are scored, as zip() would do.
_n_pairs = min(len(preds), len(truths))

if _n_pairs:
    # Encode each side once as a batch instead of one encode() call per sentence.
    pred_embs = embedder.encode(preds[:_n_pairs], convert_to_tensor=True)
    truth_embs = embedder.encode(truths[:_n_pairs], convert_to_tensor=True)
    # pairwise_cos_sim scores row i of one side against row i of the other.
    embedding_similarities = util.pairwise_cos_sim(pred_embs, truth_embs).tolist()
else:
    embedding_similarities = []

cosine_result = {
    # The mean is undefined for an empty test set: report NaN instead of
    # raising ZeroDivisionError.
    "average_cosine_similarity": (
        sum(embedding_similarities) / len(embedding_similarities)
        if embedding_similarities
        else float("nan")
    )
}

# ✅ Display results
# Header first, then one line per metric, each shown with four decimals.
print("📊 Evaluation Results:")
for _report_line in (
    f"🔹 Exact Match: {em_result['exact_match']:.4f}",
    f"🔹 BLEU Score: {bleu_result['bleu']:.4f}",
    f"🔹 BERTScore F1: {bertscore_result['f1']:.4f}",
    f"🔹 Cosine Similarity: {cosine_result['average_cosine_similarity']:.4f}",
):
    print(_report_line)
