In [1]:
# Colab-only: mount Google Drive at /content/drive so paths under it can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Name of the fine-tuned run; only used to build LORA_DIR below.
model_name = "Llama-3.2-3B-Instruct-FineTome5K"
# Directory on the mounted Drive that is handed to the eval harness as the `peft=` adapter path.
LORA_DIR = f"/content/drive/MyDrive/models/lora/{model_name}"
# Model id passed as `pretrained=` for both the base and the fine-tuned evaluation.
BASE_MODEL_ID = "unsloth/Llama-3.2-3B-Instruct"

In [3]:
import torch
# NOTE(review): MAX_SEQ_LEN is not referenced by any cell visible here — confirm it is still needed.
MAX_SEQ_LEN = 2048
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
!pip install "lm-eval>=0.4.0" transformers accelerate peft bitsandbytes -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Building wheel for sqlitedict (setup.py) ... [?25l[?25hdone
  Building wheel for word2number (setup.py) ... [?25l[?25hdone


Evaluate on light benchmarks

In [None]:
from lm_eval import simple_evaluate

def run_base_benchmark(tasks=("arc_challenge",)):
    """Evaluate the un-finetuned base model with lm-eval and return the raw results.

    Args:
        tasks: A task name or an iterable of task names (e.g. ``["arc_challenge",
            "sciq", "hellaswag"]``). Defaults to ARC-Challenge only.

    Returns:
        Whatever ``simple_evaluate`` returns for the requested tasks.
    """
    # The Python API takes a list of task names; comma-joined strings are a CLI
    # convention and would be looked up as one (unknown) task name here.
    task_list = [tasks] if isinstance(tasks, str) else list(tasks)

    print("Running benchmark eval for BASE model...")
    # NOTE: transformers reports `load_in_4bit` as deprecated (see the warning in
    # the recorded output); it still works at the time of the recorded run.
    model_args = ",".join(
        [
            f"pretrained={BASE_MODEL_ID}",
            "dtype=bfloat16",
            "load_in_4bit=True",
        ]
    )
    base_results = simple_evaluate(
        model="hf",
        model_args=model_args,
        tasks=task_list,
        batch_size=2,
        device=DEVICE,
        apply_chat_template=True,
    )
    return base_results

In [None]:
def run_ft_benchmark(tasks=("arc_challenge",)):
    """Evaluate the base model with the LoRA adapter applied and return the raw results.

    Args:
        tasks: A task name or an iterable of task names (e.g. ``["arc_challenge",
            "sciq", "hellaswag"]``). Defaults to ARC-Challenge only.

    Returns:
        Whatever ``simple_evaluate`` returns for the requested tasks.
    """
    # The Python API takes a list of task names; comma-joined strings are a CLI
    # convention and would be looked up as one (unknown) task name here.
    task_list = [tasks] if isinstance(tasks, str) else list(tasks)

    print("Running benchmark eval for FINETUNED (LoRA) model...")
    # Same settings as the base run, plus `peft=` pointing at the adapter
    # directory, so the two result sets are directly comparable.
    model_args = ",".join(
        [
            f"pretrained={BASE_MODEL_ID}",
            f"peft={LORA_DIR}",
            "dtype=bfloat16",
            "load_in_4bit=True",
        ]
    )
    ft_results = simple_evaluate(
        model="hf",
        model_args=model_args,
        tasks=task_list,
        batch_size=2,
        device=DEVICE,
        apply_chat_template=True,
    )
    return ft_results

In [16]:
# Slow cell: in the recorded run the loglikelihood requests alone took about 21 minutes.
base_results = run_base_benchmark()

Running benchmark eval for BASE model...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 1172/1172 [00:01<00:00, 971.53it/s]
Running loglikelihood requests: 100%|██████████| 4687/4687 [21:21<00:00,  3.66it/s]


In [20]:
# Slow cell: in the recorded run the loglikelihood requests alone took about 22 minutes.
ft_results = run_ft_benchmark()

Running benchmark eval for FINETUNED (LoRA) model...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 1172/1172 [00:01<00:00, 929.33it/s]
Running loglikelihood requests: 100%|██████████| 4687/4687 [21:47<00:00,  3.58it/s]


In [21]:
def summarize(results, label):
    """Print every metric of every task found under ``results["results"]``.

    Args:
        results: Mapping with an optional ``"results"`` entry that maps task
            names to ``{metric_name: value}`` dicts.
        label: Heading printed above the metrics.
    """

    def _render(value):
        # Dict values: prefer "mean", then "score"; otherwise show the dict as-is.
        if isinstance(value, dict):
            for key in ("mean", "score"):
                if key in value:
                    return f"{value[key]:.3f}"
            return str(value)
        # Plain numbers get three decimals.
        if isinstance(value, (int, float)):
            return f"{value:.3f}"
        # Anything else (strings etc.) is shown verbatim.
        return str(value)

    print(f"\n==== {label} ====")

    for task, metrics in results.get("results", {}).items():
        print(f"{task}:")
        if isinstance(metrics, dict):
            for metric_name, metric_val in metrics.items():
                print(f"  {metric_name:20s} = {_render(metric_val)}")
        else:
            print("  (unexpected metrics format)", metrics)
# Print the base and fine-tuned summaries back to back so the metrics can be compared directly.
summarize(base_results, "BASE")
summarize(ft_results, "FINETUNED")


==== BASE ====
arc_challenge:
  alias                = arc_challenge
  acc,none             = 0.412
  acc_stderr,none      = 0.014
  acc_norm,none        = 0.430
  acc_norm_stderr,none = 0.014

==== FINETUNED ====
arc_challenge:
  alias                = arc_challenge
  acc,none             = 0.396
  acc_stderr,none      = 0.014
  acc_norm,none        = 0.420
  acc_norm_stderr,none = 0.014


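We see that finetuning slightly lowered accuracy on the `arc_challenge` task of the "ai2_arc" benchmark (acc 0.412 → 0.396, acc_norm 0.430 → 0.420). Note that the gap is about the size of the reported standard error (±0.014), so on its own it is not statistically conclusive. https://huggingface.co/datasets/allenai/ai2_arc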

This is expected since we've finetuned on FineTome, which targets getting better at explanation rather than at reasoning (which is what would help on "ai2_arc").