In [1]:
!pip install unsloth vllm

Collecting unsloth
  Downloading unsloth-2025.10.11-py3-none-any.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting vllm
  Downloading vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl.metadata (17 kB)
Collecting unsloth_zoo>=2025.10.12 (from unsloth)
  Downloading unsloth_zoo-2025.10.12-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.35-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth)
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets!=4.0.*,!=4.1.0,>=3.4.1 (from unsloth)
  Downloading datasets-

In [1]:
from unsloth import FastModel
from datasets import load_dataset
import torch, re, random
from transformers import AutoTokenizer, AutoModelForCausalLM


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.




INFO 10-30 12:44:13 [__init__.py:216] Automatically detected platform cuda.
ERROR 10-30 12:44:13 [fa_utils.py:57] Cannot use FA version 2 is not supported due to FA2 is only supported on devices with compute capability >= 8
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
from tqdm import tqdm

In [3]:
# Best-effort import of Unsloth's FastLanguageModel.
# NOTE(review): `unsloth` is already imported unconditionally in the first import cell, and
# FastLanguageModel is not referenced by any cell visible in this notebook. Any import error
# is swallowed silently below — confirm whether this cell is still needed.
try:
    from unsloth import FastLanguageModel
except Exception:
    pass


In [4]:
# Load the Gemma-3 1B Instruct checkpoint through Unsloth's FastModel wrapper.
# Returns both the model and its tokenizer; later cells use them for generation.
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-1b-it",   # Instruction-tuned Gemma-3 model (1B parameters)

    # max_seq_length = max_seq_length,      # Optional: set if you need longer input contexts

    load_in_4bit = False,                   # 4-bit quantization is OFF; set True to cut GPU memory on limited VRAM
    load_in_8bit = False,                   # 8-bit quantization is OFF as well

    full_finetuning = False,                # Set True only if you plan to fine-tune all model parameters.
                                            # Kept False here: this notebook only evaluates the model.

    # token = "hf_...",                     # Optional: needed only for gated/private Hub models (never hardcode a real token)
)

# Load a multiple-choice reading-comprehension dataset.
# RACE rows provide an article, a question, a list of answer options and the gold answer letter.
# split="test[:10]" keeps only the first 10 test rows so the notebook runs quickly;
# widen the slice (e.g. "test[:200]") for a more meaningful accuracy estimate.
# (load_dataset is already imported in the import cell; the re-import below is harmless.)
from datasets import load_dataset
ds = load_dataset("race", "all", split="test[:10]")

==((====))==  Unsloth 2025.10.11: Fast Gemma3 patching. Transformers: 4.57.1. vLLM: 0.11.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.
Unsloth: Gemma3 does not support SDPA - switching to fast eager.
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

all/test-00000-of-00001.parquet:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

all/train-00000-of-00001.parquet:   0%|          | 0.00/37.4M [00:00<?, ?B/s]

all/validation-00000-of-00001.parquet:   0%|          | 0.00/2.05M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4934 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/87866 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4887 [00:00<?, ? examples/s]

In [6]:
import math
def build_prompt(article, question, options):
    """Assemble the multiple-choice prompt for a single reading-comprehension item.

    Args:
        article: Passage text shown under "Passage:".
        question: Question text shown after "Question:".
        options: Sequence of answer strings. They are labelled A, B, C, D in
            order; anything past the fourth option is silently left out.

    Returns:
        One prompt string that asks the model for a "Reasoning:" line followed
        by a "Final: <LETTER>" line and nothing else.
    """
    labels = "ABCD"[:len(options)]

    option_lines = []
    for label, text in zip(labels, options):
        option_lines.append(f"{label}) {text}")
    option_block = "\n".join(option_lines)

    sections = [
        "Read the passage and reason step-by-step before answering.\n\n",
        f"Passage:\n{article}\n\n",
        f"Question: {question}\n\n",
        f"Options:\n{option_block}\n\n",
        "Output EXACTLY two lines in this order:\n",
        "Reasoning: <your concise reasoning>\n",
        "Final: <LETTER>\n",
        "After the 'Final' line, output NOTHING else.",
    ]
    return "".join(sections)

# Match variants of: "Final: A" / "Final answer: b"
LETTER_RE = re.compile(r'(?i)\bfinal(?:\s*answer)?\s*:\s*([ABCD])\b')


def _completion_text(completion):
    """Return the text of one completion, whatever container it arrived in."""
    # Chat-style completion: a non-empty list of message dicts -> use the last message.
    if isinstance(completion, list) and completion and isinstance(completion[-1], dict):
        return str(completion[-1].get("content", ""))
    # Single message dict.
    if isinstance(completion, dict):
        return str(completion.get("content", ""))
    # Plain string (or anything else); None counts as an empty completion.
    return "" if completion is None else str(completion)


def mc_reward_grpo(completions, answer=None, **kwargs):
    """Score multiple-choice completions by their last ``Final: <LETTER>`` marker.

    Args:
        completions: list[str], list[list[{'role', 'content'}]] or
            list[dict(content=...)]. ``None`` is treated as an empty list.
        answer: Gold letter(s): a single str, a sequence of str, or None.
            A sequence whose length differs from ``len(completions)`` is tiled
            cyclically and trimmed to fit. ``None`` or an EMPTY sequence means
            "no gold labels": every well-formed completion then scores 1.0.
        **kwargs: ``malformed_penalty`` (float, default 0.0) is the reward given
            to completions with no parsable ``Final: <LETTER>``. Other keys are
            ignored.

    Returns:
        list[float] with exactly one reward per completion: 1.0 for a correct
        (or unlabelled but well-formed) answer, 0.0 for a wrong answer,
        ``malformed_penalty`` when no answer letter could be extracted.
    """
    penalty = float(kwargs.get("malformed_penalty", 0.0))

    # Normalize completions -> one string per completion.
    texts = [_completion_text(c) for c in completions or []]
    n = len(texts)

    # Normalize gold answers into a list.
    if answer is None:
        golds = []
    elif isinstance(answer, str):
        golds = [answer]
    else:
        golds = list(answer)

    if not golds:
        # No labels at all. Previously an empty sequence was tiled into another
        # empty list, so zip() below produced no rewards; keep the contract of
        # one reward per completion by treating it like answer=None.
        golds = [None] * n
    elif len(golds) != n:
        golds = (golds * math.ceil(n / len(golds)))[:n]

    # Score: the last 'Final: <LETTER>' in the text wins.
    rewards = []
    for txt, gold in zip(texts, golds):
        letters = LETTER_RE.findall(txt)
        if not letters:
            rewards.append(penalty)
        elif gold is None or letters[-1].upper() == str(gold).strip().upper():
            # strip() tolerates gold labels carrying stray whitespace/newlines.
            rewards.append(1.0)
        else:
            rewards.append(0.0)
    return rewards


In [7]:
# Sanity check: predicted letter equals the gold letter -> reward 1.0.
mc_reward_grpo(
    completions=["Final: A"],
    answer=["A"]
)

[1.0]

In [8]:
# Sanity check: well-formed answer with the wrong letter -> reward 0.0.
mc_reward_grpo(
    completions=["Final: B"],
    answer=["A"]
)

[0.0]

In [9]:
# Sanity check: lower-case "final: c" still counts (the regex is case-insensitive
# and the extracted letter is upper-cased before comparison).
mc_reward_grpo(
    completions=["final: c"],
    answer=["C"]
)

[1.0]

In [10]:
# Sanity check: the "Final: <LETTER>" marker is found even when surrounded by other text.
mc_reward_grpo(
    completions=["After much thought... Final: D because..."],
    answer=["D"]
)

[1.0]

In [11]:
# Sanity check: "E" is outside A-D, so nothing is extracted and the completion
# receives the malformed penalty (default 0.0).
mc_reward_grpo(
    completions=["Final: E"],
    answer=["A"]
)

[0.0]

In [12]:
# Sanity check: the right letter without the "Final:" marker is treated as malformed
# -> default penalty 0.0.
mc_reward_grpo(
    completions=["I think it is B."],
    answer=["B"]
)

[0.0]

In [13]:
# Sanity check: an empty completion is malformed -> default penalty 0.0.
mc_reward_grpo(
    completions=[""],
    answer=["A"]
)

[0.0]

In [14]:
# Sanity check: a None completion is normalized to "" and scored as malformed.
mc_reward_grpo(
    completions=[None],
    answer=["A"]
)

[0.0]

In [15]:
# Sanity check: a custom malformed_penalty is applied to the unparsable completion,
# the single gold letter is tiled across both completions, and "Final:A" (no space) parses.
mc_reward_grpo(
    completions=["just some text", "Final:A"],
    answer=["A"],
    malformed_penalty=-1.0
)

[-1.0, 1.0]

In [16]:
# Sanity check: completions and gold letters are paired position by position.
# Expected: correct, correct, malformed (default penalty 0.0), wrong (C vs gold D).
mc_reward_grpo(
    completions=[
        "Final: A",
        "Final: B",
        "No answer here",
        "Final: C",
    ],
    answer=["A", "B", "C", "D"]
)

[1.0, 1.0, 0.0, 0.0]

In [17]:
# Sanity check: with several markers, the LAST match decides. "Final A" (no colon)
# does not match the regex, so the matches are D then A, and A is scored.
mc_reward_grpo(
    completions=["Final: D, Final A, Final: A"],
    answer=["A"]
)

[1.0]

In [18]:
# --- 1) Pick one example from the loaded split ---
i = 9  # index of the row to inspect; the split loaded above has 10 rows, so this is the last one
row = ds[i]  # expects keys: 'article', 'question', 'options', 'answer' (A/B/C/D)
article  = row["article"]
question = row["question"]
options  = row["options"]   # list[str], e.g. 4 items
gold     = row["answer"]    # 'A' | 'B' | 'C' | 'D'

# --- 2) Build the prompt (uses build_prompt defined above) ---
prompt = build_prompt(article, question, options)


In [19]:
# Display the assembled prompt to eyeball its formatting.
prompt

'Read the passage and reason step-by-step before answering.\n\nPassage:\nLittle Tommy was doing very badly in math. His parents had tried everything--tutors, cards, special learning centers--in short, everything they could think of. Finally they took Tommy to a catholic  school.\nAfter the first day, little Tommy came home with a very serious look on his face. He didn\'t kiss his mother hello. Instead, he went straight to his room and started studying. Books and papers were spread out all over the room and little Tommy was hard at work. His mother was surprised. She called him down to dinner and as soon as he finished eating, he went back to his room, without a word. In no time he was back hitting the books as hard as before. This went on for some time, day after day while the mother tried to understand what was happening.\nFinally, little Tommy brought home his report card. He quietly put it on the table and went up to his room and hit the books. His mom looked at it and to her surpri

In [20]:
# --- 3) Tokenize the prompt and generate a completion ---
# Place the inputs on the same device as the model's parameters.
# NOTE(review): the prompt is tokenized as raw text, without applying the tokenizer's
# chat template — confirm this is intended for an instruction-tuned checkpoint.
device = next(model.parameters()).device
enc = tokenizer(
    prompt,
    return_tensors="pt",
    truncation=True,
    max_length=1024,   # cap on prompt tokens; longer prompts are truncated
).to(device)

with torch.no_grad():  # inference only, no gradients needed
    gen_ids = model.generate(
        **enc,
        max_new_tokens=128,        # completion length (keep it small for MC)
        do_sample=False,           # deterministic; set True + temperature for sampling
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,  # EOS doubles as the pad token here
    )

# Slice off the prompt tokens so only the newly generated continuation is decoded.
out_text = tokenizer.decode(gen_ids[0][enc["input_ids"].shape[1]:], skip_special_tokens=True)

# --- 4) Score the completion against the gold letter (the value is printed in a later cell) ---
reward = mc_reward_grpo([out_text], answer=gold)[0]

In [80]:
# Display the raw generated completion.
out_text

'\n\nReasoning: The passage describes a sudden and unexpected success for Tommy, despite his struggles with math. The fact that he immediately recognized the man nailed to the plus sign as a sign of deception suggests a significant and potentially troubling event. This contradicts the idea that mistakes are always bad and that learning is always easy. The passage implies that something unusual and potentially negative happened, leading to a positive outcome.\n\nFinal: C\n'

In [21]:
# Re-score the single completion (same call as at the end of the generation cell) and show it.
reward = mc_reward_grpo([out_text], answer=gold)[0]
print(reward)

0.0


In [22]:
# Display the gold letter for comparison with the model's "Final:" answer.
gold

'B'

In [83]:
# Debug: list every letter the regex extracts from the completion; the reward uses the last one.
m = LETTER_RE.findall(out_text or "")
print(m)

['C']


In [118]:
# 1) Build prompt/answer columns once
def to_prompt(batch):
    """Turn a batch of rows into prompt/answer columns.

    Meant to be passed to ``Dataset.map(..., batched=True)``: ``batch`` maps
    column names to lists, and the "article", "question" and "options" columns
    are read in parallel.

    Returns:
        dict with "prompt" (one string per row, built by ``build_prompt``) and
        "answer" (the batch's answer column passed through unchanged).
    """
    prompts = []
    for article, question, options in zip(batch["article"], batch["question"], batch["options"]):
        prompts.append(build_prompt(article, question, options))
    return {"prompt": prompts, "answer": batch["answer"]}

# To evaluate on more rows, reload a larger slice first, e.g.:
# ds = load_dataset("race", "all", split="test[:200]")
# Add the "prompt" column (and rewrite "answer") on the dataset; note this rebinds `ds`.
ds = ds.map(to_prompt, batched=True, batch_size = 1)

# 2) Generation + scoring via map (the map call itself lives in the next cell)

# Match variants of: "Final: A" / "Final answer: b"
# NOTE(review): identical to the LETTER_RE defined next to mc_reward_grpo; this rebinds the
# same module-level name, so the two definitions must be kept in sync (or this one removed).
LETTER_RE = re.compile(r'(?i)\bfinal(?:\s*answer)?\s*:\s*([ABCD])\b')

# Keyword arguments shared by every model.generate call in generate_and_score.
GEN_KW = dict(
    max_new_tokens=128,        # <-- HARD STOP on completion length (don't remove)
    do_sample=False,           # greedy decoding
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# NOTE(review): BATCH_SIZE is not referenced by any visible cell — presumably intended for
# a batched map call; confirm or remove.
BATCH_SIZE = 1

def generate_and_score(batch):
    """Generate a completion for each prompt in ``batch`` and score it.

    Args:
        batch: Mapping with "prompt" (a string or list of strings) and
            "answer" (a gold letter or list of gold letters).

    Returns:
        dict with "pred_text" (list of decoded continuations) and "reward"
        (list of floats from ``mc_reward_grpo``), one entry per prompt.
        Both values are lists even for a single example, so when this is
        mapped with ``batched=False`` each row stores one-element lists.

    Side effects:
        Prints the decoded texts and the three list lengths as a debugging aid.
    """
    encoded = tokenizer(
        batch["prompt"],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1024,
    )
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    with torch.no_grad():
        generated = model.generate(**encoded, **GEN_KW)

    # Drop the prompt tokens so only the continuation is decoded (clearer for the regex).
    prompt_len = encoded["input_ids"].shape[1]
    texts = tokenizer.batch_decode(generated[:, prompt_len:], skip_special_tokens=True)

    # Bring the gold answers to a list of the same length as `texts`.
    answers = batch["answer"]
    n_texts = len(texts)
    if isinstance(answers, str):
        answers = [answers] * n_texts
    elif len(answers) != n_texts:
        # Defensive: tile cyclically, then trim to the number of texts.
        period = max(1, len(answers))
        repeats = (n_texts + period - 1) // period
        answers = (list(answers) * repeats)[:n_texts]

    # Score one completion at a time: the reward function takes a list and returns a list.
    rewards = []
    for text, gold_letter in zip(texts, answers):
        rewards.append(mc_reward_grpo([text], answer=gold_letter)[0])

    # Debug output while the pipeline is being validated.
    print(texts, len(texts), len(answers), len(rewards))

    return {"pred_text": texts, "reward": rewards}

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [119]:
# Run generation + scoring over the dataset. With batched=False the function is called once
# per row, and because it returns lists, each row's "pred_text"/"reward" is a one-element list
# (see the printed rewards below).
pred_ds = ds.map(generate_and_score, batched=False)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

['\n\nReasoning: The passage states that Nancy tried to gather her cows to a higher ground.\nFinal: B\n\n'] 1 1 1
['\n\nReasoning: The passage states that the rain was heavy and the water level was rising, and that rescuers took more than two hours to discover Nancy. This indicates that the weather was difficult and hindered rescue efforts. Option B is incorrect because the passage states that the rain was "much heavier" and the water was "waist high."\nFinal: B\n\nReasoning: The passage describes Nancy\'s struggle to get out of the flood and the subsequent rescue. It doesn\'t mention that the rescuers took more than two hours to discover Nancy. The passage states that rescuers took more than two hours to discover Nancy, and that it'] 1 1 1
['\n\nReasoning: The passage states that the local people "set up an emergency shelter" for the cows.\nFinal: A\n\n'] 1 1 1
['\n\nReasoning: The passage states that the dress of an individual is a "sign language" that communicates a set of informati

In [117]:
# Print the stored reward of each of the 10 rows (the bound matches the "test[:10]" split;
# adjust it if a larger slice is loaded).
for i in range(10):
  print(pred_ds[i]["reward"])

[0.0]
[0.0]
[1.0]
[1.0]
[1.0]
[0.0]
[1.0]
[0.0]
[1.0]
[0.0]
