In [49]:
# Install the libraries used below (transformers, datasets and trl are imported in later cells).
# NOTE(review): versions are only lower-bounded or unpinned here; pin exact versions if this
# notebook needs to be reproducible.
%pip -q install "transformers>=4.44.0" "datasets>=2.20.0" "accelerate>=0.33.0" bitsandbytes trl peft sentencepiece


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [50]:
import torch, random, re
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOTrainer, GRPOConfig

In [51]:
import transformers, trl
# Report the library versions and CUDA availability for this kernel.
for label, value in (
    ("transformers:", transformers.__version__),
    ("trl         :", trl.__version__),
    ("torch       :", torch.__version__),
    ("CUDA        :", torch.cuda.is_available()),
):
    print(label, value)

transformers: 4.57.1
trl         : 0.25.0
torch       : 2.7.1
CUDA        : False


In [52]:
# Load the first 1,000 rows of the GSM8K "main" configuration's train split.
raw = load_dataset("gsm8k", "main", split="train[:1000]")
def to_prompt(r):
    """Turn one raw row into a ``prompt`` / ``gold`` pair.

    Args:
        r: Mapping with string fields ``"question"`` and ``"answer"``.

    Returns:
        dict with ``"prompt"`` (fixed instruction line, a blank line, then the
        stripped question) and ``"gold"`` (the stripped text following the last
        ``"####"`` marker in the answer, or the whole stripped answer when the
        marker is absent).
    """
    question = r["question"].strip()
    # Everything after the final "####" marker is treated as the reference answer.
    answer_parts = r["answer"].split("####")
    final_answer = answer_parts[-1].strip()
    prompt = f"Solve step-by-step, then give final answer after 'Final Answer:'.\n\n{question}"
    return {"prompt": prompt, "gold": final_answer}
# Apply to_prompt to every row and drop all original columns, leaving only its output fields.
ds = raw.map(to_prompt, remove_columns=raw.column_names)

In [53]:
# Disable HF progress bars to avoid traitlets/layout/contextvar errors
import os
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"  # keep the fast-downloader off on this kernel
# NOTE(review): transformers/datasets were imported in earlier cells, so huggingface_hub is
# presumably already loaded at this point — confirm these env vars still take effect when
# set this late; the explicit call below does not depend on them.

from huggingface_hub.utils import disable_progress_bars
disable_progress_bars()

from huggingface_hub import snapshot_download

# Hub repository to fine-tune and the local directory its files are mirrored into.
MODEL_ID  = "HuggingFaceTB/SmolLM2-135M"
CACHE_DIR = "./_hf_cache_colab4"

# Fetch only weight, config and tokenizer files into CACHE_DIR.
# `resume_download` is intentionally not passed: huggingface_hub documents it as deprecated
# and ignored (interrupted downloads are always resumed when possible).
local_model_path = snapshot_download(
    repo_id=MODEL_ID,
    local_dir=CACHE_DIR,
    allow_patterns=["*.safetensors","*.bin","*.json","*.model","tokenizer*","*merges*"],
    max_workers=8,
)
print("Downloaded to:", local_model_path)


Downloaded to: /Users/keerthana/Keerthana/workspace/unsloth/_hf_cache_colab4




In [54]:
# Pin bitsandbytes to 0.42.0 (the first install cell requested it without a version).
# NOTE(review): no visible cell imports bitsandbytes — confirm this pin is still needed.
%pip install "bitsandbytes==0.42.0"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [55]:
tokenizer = AutoTokenizer.from_pretrained(local_model_path, use_fast=True)

# Set up proper padding configuration.
# Assigning the literal string "[PAD]" does not add a token to the vocabulary; the recorded
# training run shows the pad id resolving to 0 rather than to a dedicated token. Reuse the
# EOS token instead, and only when the tokenizer ships without a pad token of its own.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # Left padding is often better for causal LM generation

# NOTE(review): device_map="auto" makes the trainer log "The model is already on multiple
# devices" and skip its own device placement — confirm this is wanted on a single-device run.
model = AutoModelForCausalLM.from_pretrained(local_model_path, device_map="auto")

# Ensure the model knows about the pad token.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# Resize embeddings only when the tokenizer has more entries than the embedding matrix.
# This check is independent of whether the config already carried a pad id.
if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
    model.resize_token_embeddings(len(tokenizer))


In [56]:
# Reward function: compare extracted final number with gold
def extract_final(text):
    """Return the text following the last ``Final Answer:`` marker in ``text``.

    The captured value runs from the first non-whitespace character after the
    marker to the end of that line, stripped of surrounding whitespace.
    Returns ``None`` when the marker never appears with content after it.
    """
    last_hit = None
    # Walk every occurrence and keep only the final one.
    for hit in re.finditer(r"Final Answer:\s*([^\n]+)", text):
        last_hit = hit.group(1)
    if last_hit is None:
        return None
    return last_hit.strip()

def reward_fn(samples=None, prompts=None, golds=None, *, completions=None, gold=None, **kwargs):
    """Score each completion 1.0 when its extracted final answer equals the gold answer.

    GRPOTrainer calls reward functions with keyword arguments (``prompts=``,
    ``completions=``, plus one keyword per extra dataset column — here ``gold=`` —
    and further bookkeeping keywords). The original positional-only signature
    ``(samples, prompts, golds)`` cannot accept that call, so both conventions
    are supported here.

    Args:
        samples: Completion strings (legacy positional name).
        prompts: Prompt strings; accepted for interface compatibility, unused.
        golds: Reference answers (legacy positional name).
        completions: Completion strings as passed by the trainer.
        gold: Reference answers as passed by the trainer from the ``gold`` column.
        **kwargs: Any additional keywords supplied by the trainer; ignored.

    Returns:
        1-D float32 ``torch.Tensor`` with one reward per (completion, gold) pair.

    Raises:
        TypeError: If no completions or no gold answers were supplied.
    """
    texts = completions if samples is None else samples
    answers = gold if golds is None else golds
    if texts is None or answers is None:
        raise TypeError(
            "reward_fn needs completions (samples=/completions=) and gold answers (golds=/gold=)"
        )

    rewards = []
    for s, g in zip(texts, answers):
        pred = extract_final(s)
        # An empty or missing extraction never counts as correct.
        rewards.append(1.0 if pred and pred == g else 0.0)
    return torch.tensor(rewards, dtype=torch.float32)

In [60]:
# Configure GRPO for proper padding handling
# Builds the GRPO config, wraps the module-level model/tokenizer/dataset in a GRPOTrainer and
# starts training.
# NOTE(review): the recorded run of this cell ended with
# "RuntimeError: probability tensor contains either `inf`, `nan` or element < 0"; the cause is
# not visible from this cell — investigate before relying on the trained weights.
cfg = GRPOConfig(
    output_dir="smollm2-grpo",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_generations=2,
    learning_rate=1e-5,
    num_train_epochs=1,
    # Mixed precision is explicitly off (the version cell reported CUDA as unavailable).
    fp16=False,
    bf16=False,
    logging_steps=10,
    save_steps=200,
    report_to="none",
    optim="adamw_torch",
    # NOTE(review): GRPOConfig presumably also has dedicated fields for completion length and
    # temperature — confirm whether passing them through generation_kwargs is intended.
    generation_kwargs={           # Explicit generation settings
        "max_new_tokens": 256,
        "do_sample": True,
        "temperature": 0.5,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    },
)

# Enable memory optimizations
# Called on the module-level model before it is handed to the trainer.
model.gradient_checkpointing_enable()

trainer = GRPOTrainer(
    model=model,
    args=cfg,
    processing_class=tokenizer,  # Use tokenizer directly
    reward_funcs=[reward_fn],  # List of reward callables; here only the exact-match reward
    train_dataset=ds,
)

# Enable gradient computation for inputs (needed for training)
# NOTE(review): presumably only required together with gradient checkpointing — confirm.
trainer.model.enable_input_require_grads()
trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

In [None]:
pip show torch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Name: torch
Version: 2.7.1
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3-Clause
Location: /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages
Requires: filelock, fsspec, jinja2, networkx, sympy, typing-extensions
Required-by: accelerate, autogluon.multimodal, autogluon.timeseries, fastai, lightning, peft, pytorch-lightning, pytorch-metric-learning, sentence-transformers, timm, torchmetrics, torchvision
Note: you may need to restart the kernel to use updated packages.


In [None]:
def reason(q):
    """Sample one completion for ``q`` from the module-level model and print it.

    The prompt is tokenized with the module-level ``tokenizer``, moved to the
    model's device, and extended with up to 256 newly sampled tokens. The full
    decoded sequence is printed with special tokens removed; nothing is returned.
    """
    encoded = tokenizer(q, return_tensors="pt").to(model.device)
    sampling = dict(max_new_tokens=256, do_sample=True, top_p=0.9, temperature=0.7)
    generated = model.generate(**encoded, **sampling)
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(decoded)
reason("A train travels 60 miles at 30 mph. How long did it take? Conclude with 'Final Answer: <value>'.")


A train travels 60 miles at 30 mph. How long did it take? Conclude with 'Final Answer: <value>'.

## 60.

<value> = <value> * <value> * <value> * <value>

= 60 * 30 * 30 * 30

= 12000

<value> = <value> * <value>

= 12000

## 61.

<value> = <value> * <value>

= 30 * 30 * 30

= 25600

<value> = <value> * <value>

= 25600

## 62.

<value> = <value> * <value>

= 30 * 30 * 30

= 25600

<value> = <value> * <value>

= 25600

## 63.

<value> = <value> * <value>

= 30 * 30 * 30

= 25600

<value> =
