In [1]:
!pip install unsloth vllm

Collecting unsloth
  Downloading unsloth-2025.10.11-py3-none-any.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting vllm
  Downloading vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl.metadata (17 kB)
Collecting unsloth_zoo>=2025.10.12 (from unsloth)
  Downloading unsloth_zoo-2025.10.12-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.35-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth)
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets!=4.0.*,!=4.1.0,>=3.4.1 (from unsloth)
  Downloading datasets-

In [1]:
from unsloth import FastModel
from datasets import load_dataset
import torch, re, random
from transformers import AutoTokenizer, AutoModelForCausalLM


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.




INFO 10-28 15:22:45 [__init__.py:216] Automatically detected platform cuda.
ERROR 10-28 15:22:47 [fa_utils.py:57] Cannot use FA version 2 is not supported due to FA2 is only supported on devices with compute capability >= 8
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
from tqdm import tqdm

In [3]:
# Optional import of FastLanguageModel (only usable when Unsloth imports cleanly).
# The import stays best-effort with the same broad `except Exception` as before,
# but a failure is no longer silent: the name is bound to None so later cells can
# check `FastLanguageModel is not None`, and a warning records the reason.
try:
    from unsloth import FastLanguageModel
except Exception as exc:
    import warnings

    FastLanguageModel = None
    warnings.warn(f"Unsloth FastLanguageModel is unavailable: {exc!r}")


In [4]:
# Instantiate the instruction-tuned Gemma-3 1B checkpoint and its tokenizer through
# Unsloth's FastModel loader. Both quantization switches and full fine-tuning are
# disabled for this run (the notebook evaluates the model, it does not train it).
#
# Optional keyword arguments that are deliberately left unset:
#   max_seq_length = max_seq_length   -> set when longer input contexts are needed
#   token = "hf_..."                  -> only for gated or private Hugging Face repos
model, tokenizer = FastModel.from_pretrained(
    model_name="unsloth/gemma-3-1b-it",
    full_finetuning=False,   # True only when every model parameter should be fine-tuned
    load_in_8bit=False,      # 8-bit quantization switched off
    load_in_4bit=False,      # 4-bit quantization switched off (enable when VRAM is limited)
)

# Load a multiple-choice reading-comprehension dataset for evaluation.
# RACE ("all" configuration): each example has an article, a question and answer options.
# Only the first N_EVAL_SAMPLES examples of the test split are used so the run stays short.
from datasets import load_dataset

N_EVAL_SAMPLES = 100  # size of the evaluation slice; raise it for a more reliable estimate
ds = load_dataset("race", "all", split=f"test[:{N_EVAL_SAMPLES}]")

==((====))==  Unsloth 2025.10.11: Fast Gemma3 patching. Transformers: 4.57.1. vLLM: 0.11.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.
Unsloth: Gemma3 does not support SDPA - switching to fast eager.
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


In [5]:
# Option letters available for a multiple-choice question.
LETTERS = "ABCD"
# Case-insensitive pattern that captures the answer letter from text such as
# "Final: A" or "Final answer: b". It is the same pattern that LETTER_RE is
# assigned further down in this notebook, so the name has a single meaning no
# matter which assignment ran last (the previous, narrower pattern here was
# immediately shadowed and did not accept the "Final answer:" variant).
LETTER_RE = re.compile(r'(?i)\bfinal(?:\s*answer)?\s*:\s*([ABCD])\b')


def build_prompt(article, question, options):
    """
    Assemble the step-by-step reasoning prompt for one multiple-choice item.

    Parameters:
        article (str): Passage the question refers to.
        question (str): Question text.
        options (list of str): Answer choices; each one is labelled with the
            letter at the same position in LETTERS.

    Returns:
        str: Prompt made of the passage, the question, the lettered options and
             an instruction to finish the answer with 'Final: <LETTER>'.
    """
    # One line per choice, e.g. "A) first option".
    option_lines = []
    for letter, choice in zip(LETTERS[:len(options)], options):
        option_lines.append(f"{letter}) {choice}")
    option_block = "\n".join(option_lines)

    # Sections are concatenated in reading order: instruction, passage,
    # question, options, required answer format.
    sections = [
        "Read the passage and reason step-by-step before answering.\n\n",
        f"Passage:\n{article}\n\n",
        f"Question: {question}\n\n",
        f"Options:\n{option_block}\n\n",
        "Respond with reasoning and finish with 'Final: <LETTER>'.",
    ]
    return "".join(sections)

# Match variants of: "Final: A" / "Final answer: b"
LETTER_RE = re.compile(r'(?i)\bfinal(?:\s*answer)?\s*:\s*([ABCD])\b')

def mc_reward_safe(outputs, golds, malformed_penalty=0.0):
    """
    Score model outputs against gold answer letters.

    For each output, the LAST 'Final: <LETTER>' / 'Final answer: <LETTER>'
    occurrence (case-insensitive, letters A-D) is taken as the prediction.

    outputs: str | list[str]   (a None entry is treated as empty text)
    golds:   str | list[str]   (compared after strip() + upper(); tiled or
                                truncated when its length differs from outputs)
    malformed_penalty: float -> reward if no valid 'Final: <LETTER>' is present
    returns: list[float], exactly one reward per output:
             1.0 correct, 0.0 wrong, malformed_penalty when nothing was extracted
    """
    # Normalize to lists
    if isinstance(outputs, str):
        outputs = [outputs]
    if isinstance(golds, str):
        golds = [golds]

    # Tile/truncate golds to match outputs length.
    if len(golds) != len(outputs):
        # With no gold labels at all, pad with None so every output still gets
        # a reward (nothing can compare equal to a missing gold).
        golds = list(golds) or [None]
        # Integer ceiling division. This used to call math.ceil, but `math` is
        # not imported anywhere in the notebook, so this branch raised NameError.
        rep = -(-len(outputs) // len(golds))
        golds = (golds * rep)[:len(outputs)]

    rewards = []

    for out, gold in zip(outputs, golds):
        txt = "" if out is None else str(out)

        # Find ALL "Final" answers and pick the last
        matches = LETTER_RE.findall(txt)

        if not matches:
            rewards.append(float(malformed_penalty))
            continue

        pred = matches[-1].upper()   # last occurrence

        # Normalize the gold label the same way as the prediction so that
        # "a" or " A " is not counted as a mismatch against "A".
        gold_letter = "" if gold is None else str(gold).strip().upper()

        rewards.append(1.0 if pred == gold_letter else 0.0)

    return rewards

In [6]:
# Display the device the loaded model is placed on.
model.device

device(type='cuda', index=0)

In [7]:
# Prediction equals the gold letter -> reward 1.0.
mc_reward_safe(
    outputs=["Final: A"],
    golds=["A"]
)

[1.0]

In [8]:
# Well-formed answer with the wrong letter -> reward 0.0.
mc_reward_safe(
    outputs=["Final: B"],
    golds=["A"]
)

[0.0]

In [9]:
# Matching is case-insensitive and the extracted letter is upper-cased before comparison.
mc_reward_safe(
    outputs=["final: c"],
    golds=["C"]
)

[1.0]

In [10]:
# The 'Final: <LETTER>' marker may appear anywhere inside surrounding text.
mc_reward_safe(
    outputs=["After much thought... Final: D because..."],
    golds=["D"]
)

[1.0]

In [11]:
# 'E' is outside A-D, so the pattern finds nothing and the default
# malformed_penalty (0.0) is returned.
mc_reward_safe(
    outputs=["Final: E"],
    golds=["A"]
)

[0.0]

In [12]:
# The right letter is mentioned but without a 'Final:' marker -> treated as malformed.
mc_reward_safe(
    outputs=["I think it is B."],
    golds=["B"]
)

[0.0]

In [13]:
# Empty output -> no match -> malformed_penalty (default 0.0).
mc_reward_safe(
    outputs=[""],
    golds=["A"]
)

[0.0]

In [14]:
# A None output is converted to an empty string and therefore scored as malformed.
mc_reward_safe(
    outputs=[None],
    golds=["A"]
)

[0.0]

In [15]:
# A custom malformed_penalty is returned when no 'Final: <LETTER>' is found.
mc_reward_safe(
    outputs=["just some text"],
    golds=["A"],
    malformed_penalty=-1.0
)

[-1.0]

In [16]:
# Batch of four: two correct, one malformed (no marker), one wrong letter (C vs. D).
mc_reward_safe(
    outputs=[
        "Final: A",
        "Final: B",
        "No answer here",
        "Final: C",
    ],
    golds=["A", "B", "C", "D"]
)

[1.0, 1.0, 0.0, 0.0]

In [18]:
# Several markers in one output: the last valid one wins. "Final A" has no colon
# and is not matched, so the candidates are D and then A.
mc_reward_safe(
    outputs=["Final: D, Final A, Final: A"],
    golds=["A"]
)

[1.0]

In [19]:
# Inference-only setup.
# Tokenizer: pad on the left (safer for causal LMs) and fall back to the EOS
# token for padding when the tokenizer defines no pad token of its own.
tokenizer.padding_side = "left"
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

# Model: disable gradient tracking globally and switch to evaluation mode.
torch.set_grad_enabled(False)
model.eval()

# 1) Build prompt/answer columns once
def to_prompt(batch):
    """Build the 'prompt' column for a batch and pass 'answer' through.

    batch is indexed by "article", "question", "options" and "answer". The first
    three are walked in parallel and handed to build_prompt; "answer" is
    returned unchanged.
    """
    prompts = []
    for article, question, options in zip(batch["article"], batch["question"], batch["options"]):
        prompts.append(build_prompt(article, question, options))
    return {"prompt": prompts, "answer": batch["answer"]}

# Add the "prompt" column (and re-emit "answer") for every example.
# Note: this rebinds `ds`, so the dataset must already have been loaded by an earlier cell.
ds = ds.map(to_prompt, batched=True)

# 2) Batched generation via map

# Answer-extraction pattern: accepts "Final: A" as well as "Final answer: b".
LETTER_RE = re.compile(r'(?i)\bfinal(?:\s*answer)?\s*:\s*([ABCD])\b')

# Keyword arguments forwarded to model.generate().
# NOTE(review): temperature and top_p are sampling parameters; with
# do_sample=False they presumably have no effect — confirm against the
# transformers generation documentation before relying on them.
GEN_KW = {
    "max_new_tokens": 1000,   # hard cap on generated tokens; do not remove
    "do_sample": False,
    "temperature": 0.0,
    "top_p": 1.0,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
}

# Number of examples handed to generate_and_score per map() call.
BATCH_SIZE = 1

def generate_and_score(batch):
    """
    Generate a completion for every prompt in the batch and score it.

    batch: mapping with a "prompt" column (list[str]) and an "answer" column
           (gold letters, one per prompt).
    returns: dict with
        "pred_text": list[str] — the generated continuation only (prompt removed)
        "reward":    one entry per example holding mc_reward_safe's return
                     value, i.e. a one-element list of float
                     NOTE(review): the nesting looks accidental but is kept so
                     that existing consumers of this column keep working.
    """
    inputs = tokenizer(
        batch["prompt"],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=2048,       # truncate long contexts
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # Width of the (padded) prompt; everything after it in the output is new text.
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(**inputs, **GEN_KW)

    # The generated sequences start with the prompt tokens. Drop them so that
    # scoring and "pred_text" only see what the model wrote; otherwise a
    # "final: <letter>" that happens to occur in the passage could be scored
    # as the model's answer when the completion itself contains none.
    completions = outputs[:, prompt_len:]
    texts = tokenizer.batch_decode(completions, skip_special_tokens=True)
    rewards = [mc_reward_safe(t, g) for t, g in zip(texts, batch["answer"])]
    return {"pred_text": texts, "reward": rewards}

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [20]:
# Peek at the first record of the dataset.
ds[0]

{'example_id': 'high19432.txt',
 'article': 'The rain had continued for a week and the flood had created a big river which were running by Nancy Brown\'s farm. As she tried to gather her cows to a higher ground, she slipped and hit her head on a fallen tree trunk. The fall made her unconscious for a moment or two. When she came to, Lizzie, one of her oldest and favorite cows, was licking her face. \nAt that time, the water level on the farm was still rising. Nancy gathered all her strength to get up and began walking slowly with Lizzie. The rain had become much heavier, and the water in the field was now waist high. Nancy\'s pace got slower and slower because she felt a great pain in her head. Finally, all she could do was to throw her arm around Lizzie\'s neck and try to hang on. About 20 minutes later, Lizzie managed to pull herself and Nancy out of the rising water and onto a bit of high land, which seemed like a small island in the middle of a lake of white water. \nEven though it 

In [21]:
# Run generation + scoring over the whole dataset, BATCH_SIZE examples per call.
pred_ds = ds.map(generate_and_score, batched=True, batch_size=BATCH_SIZE)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]