In [None]:
# Cell 1: Imports
# All third-party dependencies of the notebook are imported here, once, at the top.
import pandas as pd                                      # data loading and manipulation
import torch                                             # tensor/device management
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM  # model loading and inference
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  # evaluation metrics
from sklearn.metrics import top_k_accuracy_score        # Top-K accuracy
from sklearn.metrics import ConfusionMatrixDisplay      # confusion-matrix plotting
from tqdm.notebook import tqdm                           # progress bar display
import matplotlib.pyplot as plt                          # visualization


In [None]:
# Cell 2: Configuration
# Every value a reader might want to tune lives in this cell.

# --- Models -----------------------------------------------------------------
# NOTE(review): Llama 2 was released in 7B/13B/70B sizes; confirm that this
# repo id actually resolves on the Hugging Face Hub before running.
base_model_id       = "meta-llama/Llama-2-3b-hf"   # Hub ID of the base model
finetuned_model_dir = "outputs"                    # local directory of the fine-tuned model

# --- Data -------------------------------------------------------------------
validation_csv = "data/processed/symptom-disease-validation-dataset.csv"

# --- Generation -------------------------------------------------------------
max_gen_length       = 128   # token budget handed to the generation pipelines
num_return_sequences = 1     # keep only the top sequence per prompt

# --- Hardware ---------------------------------------------------------------
# Device index for the pipelines: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1


In [None]:
# Cell 3: Load data
# Read the validation set; the inference loop reads its "text" (symptoms)
# and "label" (ground truth) columns.
val_df = pd.read_csv(validation_csv)

# Fail fast: without this check a wrong or renamed column would only surface
# inside the inference loop, after both models have already been loaded.
missing_cols = {"text", "label"} - set(val_df.columns)
if missing_cols:
    raise KeyError(
        f"{validation_csv} is missing required column(s): {sorted(missing_cols)}"
    )

val_df.head()

In [None]:
# Cell 4: Model + Pipeline setup
def _build_generation_pipeline(model_source):
    """Load a causal LM with its tokenizer and wrap both in a text-generation pipeline.

    Args:
        model_source: Hugging Face Hub ID or local directory, passed unchanged
            to ``from_pretrained`` for both the tokenizer and the model.

    Returns:
        A ``(tokenizer, model, pipe)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_source)
    # device_map="auto" delegates weight placement to accelerate.
    model = AutoModelForCausalLM.from_pretrained(model_source, device_map="auto")
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        # No `device=` argument: the model was already placed via
        # device_map="auto", and passing `device` as well conflicts with that
        # placement (recent transformers versions refuse the combination).
        # max_new_tokens bounds only the generated continuation; max_length
        # would also count the prompt tokens and could leave no room to answer.
        max_new_tokens=max_gen_length,
        num_return_sequences=num_return_sequences,
        # Reuse EOS as the padding token id for generation.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer, model, pipe


# Base model pipeline
base_tokenizer, base_model, base_pipe = _build_generation_pipeline(base_model_id)

# Fine-tuned model pipeline
ft_tokenizer, ft_model, ft_pipe = _build_generation_pipeline(finetuned_model_dir)

In [None]:
# Cell 5: Inference loop
def _answer_after_inst(pipe, prompt):
    """Greedy-decode `prompt` with `pipe`; return the stripped text after the last "[/INST]"."""
    generated = pipe(prompt, do_sample=False)[0]["generated_text"]
    return generated.split("[/INST]")[-1].strip()


results = []
row_iter = tqdm(val_df.iterrows(), total=len(val_df), desc="Running inference")
for _, row in row_iter:
    symptoms, true_label = row["text"], row["label"]

    # Chat-style instruction prompt shared by both models.
    # NOTE(review): the leading "[s]" is kept verbatim; confirm it matches the
    # prompt format used during fine-tuning (Llama-style templates use "<s>").
    prompt = f"[s][INST] Symptoms: {symptoms} [/INST]"

    # Base model first, then the fine-tuned model, on the identical prompt.
    base_pred = _answer_after_inst(base_pipe, prompt)
    ft_pred = _answer_after_inst(ft_pipe, prompt)

    record = {
        "symptoms": symptoms,
        "true": true_label,
        "base_pred": base_pred,
        "ft_pred": ft_pred,
    }
    results.append(record)

# One row per validation example, built in a single pass from the records.
comp_df = pd.DataFrame(results)

In [None]:
# Cell 6: Quantitative evaluation
# Exact-match accuracy on the raw strings: a prediction only counts when the
# generated text equals the ground-truth label exactly.
base_acc = accuracy_score(comp_df["true"], comp_df["base_pred"])
ft_acc   = accuracy_score(comp_df["true"], comp_df["ft_pred"])
print(f"Base Model Accuracy:      {base_acc:.2%}")
print(f"Fine-tuned Model Accuracy:{ft_acc:.2%}")

# Top-1 accuracy (indices coding)
# Labels are index-encoded from the ground truth; any prediction that is not
# one of the known labels is mapped to the sentinel -1.
labels = list(comp_df["true"].unique())
label2idx = {lbl: i for i, lbl in enumerate(labels)}
y_true = [label2idx[t] for t in comp_df["true"]]
y_base = [label2idx.get(p, -1) for p in comp_df["base_pred"]]
y_ft   = [label2idx.get(p, -1) for p in comp_df["ft_pred"]]
# top_k_accuracy_score expects a score array (n_samples, n_classes); the models
# only yield one hard prediction per sample, and wrapping that list in another
# list produces shape (1, n_samples), which is rejected. With hard predictions
# Top-1 accuracy is plain accuracy; the -1 sentinel never equals a true index,
# so unmatched generations count as errors.
print("Base Top-1 Accuracy:", accuracy_score(y_true, y_base))
print("Fine-tuned Top-1 Accuracy:", accuracy_score(y_true, y_ft))

# Classification report for fine-tuned model
# Restrict the report to the known class indices. Without `labels=`, a single
# -1 in y_ft adds an extra class and the call fails because the class count no
# longer matches target_names. zero_division=0 silences the warning for classes
# the model never predicted.
known_class_ids = list(range(len(labels)))
print("\nFine-tuned Model Classification Report:")
print(
    classification_report(
        y_true,
        y_ft,
        labels=known_class_ids,
        target_names=[str(lbl) for lbl in labels],
        zero_division=0,
    )
)

In [None]:
# Cell 7: Confusion matrix plot
# Pin the class order explicitly so rows/columns always line up with the tick
# labels. Predictions that matched no known label are encoded as -1; left
# implicit, that extra class sorts first and shifts every tick label (and makes
# the number of tick labels disagree with the matrix size).
class_ids = list(range(len(labels)))
tick_labels = [str(lbl) for lbl in labels]
if -1 in y_ft:
    # Show unmatched generations as their own trailing row/column.
    class_ids.append(-1)
    tick_labels.append("(unmatched)")

cm = confusion_matrix(y_true, y_ft, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tick_labels)
fig, ax = plt.subplots(figsize=(8, 8))
disp.plot(cmap="Blues", ax=ax)
ax.set_title("Fine-tuned Model Confusion Matrix")
plt.show()

In [None]:
# Cell 8: Save comparison dataframe
# Persist the per-example predictions of both models for later inspection.
comparison_csv = "outputs/model_comparison.csv"
comp_df.to_csv(comparison_csv, index=False)  # index=False: do not write the row index
print(f"Saved comparison results to {comparison_csv}")