In [5]:
!pip install unsloth vllm

Collecting unsloth
  Downloading unsloth-2025.10.3-py3-none-any.whl.metadata (59 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting vllm
  Downloading vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl.metadata (17 kB)
Collecting unsloth_zoo>=2025.10.2 (from unsloth)
  Downloading unsloth_zoo-2025.10.3-py3-none-any.whl.metadata (31 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.35-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth)
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets!=4.0.*,!=4.1.0,>=3.4.1 (from unsloth)
  Downloading datasets-4.2

In [1]:
from unsloth import FastModel
from datasets import load_dataset
import torch, re, random
from transformers import AutoTokenizer, AutoModelForCausalLM

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 10-14 19:55:05 [__init__.py:216] Automatically detected platform cuda.
ERROR 10-14 19:55:06 [fa_utils.py:57] Cannot use FA version 2 is not supported due to FA2 is only supported on devices with compute capability >= 8
🦥 Unsloth Zoo will now patch everything to make training faster!


In [21]:
# Load the Gemma-3 1B Instruct model using FastModel (Unsloth integration for faster loading)
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-1b-it",   # Instruction-tuned Gemma-3 model (1B parameters)

    # max_seq_length = max_seq_length,      # Optional: set if you need longer input contexts (e.g., 8k–32k tokens)

    load_in_4bit = False,                   # Use 4-bit quantization to drastically reduce GPU memory (set True if limited VRAM)
    load_in_8bit = False,                   # Use 8-bit quantization (more accurate than 4-bit, but uses roughly 2× memory)

    full_finetuning = False,                # Set True only if you plan to fine-tune all model parameters
                                            # Here we keep False since we’re evaluating, not training

    # token = "hf_...",                     # Optional: required if the model is gated or private on Hugging Face Hub
)

# Load a reasoning-based multiple-choice QA dataset
# RACE = Reading comprehension dataset (middle/high school level) with context, question, and 4 answer options
# "test[:200]" limits the split to 200 samples for faster experimentation during the workshop
from datasets import load_dataset
ds = load_dataset("race", "all", split="test[:200]")

==((====))==  Unsloth 2025.10.3: Fast Gemma3 patching. Transformers: 4.56.2. vLLM: 0.11.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.
Unsloth: Gemma3 does not support SDPA - switching to fast eager.
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


In [22]:
# Define the possible option letters for multiple-choice questions
LETTERS = "ABCD"

# Regular expression to extract the final predicted letter from the model's output.
# It looks for text like: "Final: A", "Final: b", etc. (case-insensitive)
LETTER_RE = re.compile(r"Final:\s*([A-D])\b", re.IGNORECASE)


def build_prompt(article, question, options):
    """
    Builds a reasoning-style multiple-choice prompt for the model.

    Parameters:
        article (str): The context or passage that the question refers to.
        question (str): The question to be answered.
        options (list of str): The list of possible answer choices (A, B, C, D).

    Returns:
        str: A formatted prompt instructing the model to reason step-by-step and
             end its answer with 'Final: <LETTER>'.
    """
    # Combine options into a labeled list like:
    # A) option1
    # B) option2
    opts = "\n".join(f"{l}) {o}" for l, o in zip(LETTERS[:len(options)], options))

    # Full prompt with reasoning instruction and structured format
    return (
        "Read the passage and reason step-by-step before answering.\n\n"
        f"Passage:\n{article}\n\n"
        f"Question: {question}\n\n"
        f"Options:\n{opts}\n\n"
        "Respond with reasoning and finish with 'Final: <LETTER>'."
    )


def mc_reward(output, gold):
    """
    Evaluates the model's multiple-choice answer.

    Parameters:
        output (str): The text generated by the model.
        gold (str): The correct answer letter (e.g., 'A', 'B', 'C', or 'D').

    Returns:
        float: 1.0 if the model’s final answer matches the gold answer, else 0.0.
    """
    # Search for the model’s final selected letter using the regex defined above
    m = LETTER_RE.search(output)

    # If a match is found, compare it to the gold answer (case-insensitive)
    return float(m and m.group(1).upper() == gold)

In [27]:
# Initialize counters for correct predictions and total samples
correct, total = 0, 0

# Iterate over each example (context, question, options, answer) in the dataset
for ex in ds:
    # 1️⃣ Build a reasoning-style prompt for the model
    prompt = build_prompt(ex["article"], ex["question"], ex["options"])

    # 2️⃣ Tokenize the prompt and move tensors to the model's device (CPU/GPU)
    input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)

    # 3️⃣ Generate a response without computing gradients (inference only)
    with torch.no_grad():
        output_ids = model.generate(**input_ids)

    # 4️⃣ Decode the generated tokens back into text
    out = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # 5️⃣ Evaluate the model’s answer using the reward function (1 = correct, 0 = wrong)
    reward = mc_reward(out, ex["answer"])

    # 6️⃣ Update counters
    correct += reward
    total += 1

    # 7️⃣ Print intermediate progress every 20 samples
    if total % 20 == 0:
        print(f"Progress: {total} samples → accuracy {correct/total:.3f}")

    # ⚠️ Temporary break for debugging (remove to evaluate full dataset)
    break

# 8️⃣ Display the final accuracy after evaluation
print(f"\nFinal accuracy on {total} test items: {correct/total:.3f}")


Final accuracy on 1 test items: 0.000
