<a href="https://colab.research.google.com/github/hieunguyen7337/LLM_RL/blob/main/hangman_game_rl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# (wandb on; bitsandbytes kept for big-model path)
!pip -q install "trl>=0.16.0" transformers accelerate bitsandbytes peft wandb

In [2]:
import os, torch
from datasets import Dataset
from trl import GRPOTrainer, GRPOConfig
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [3]:
# Model selection and memory-related switches.
# ENABLE_4BIT: leave False for a small model; set True for bigger models (e.g., ≥7B).
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
ENABLE_4BIT = False

# Gradient checkpointing follows the 4-bit switch, so it is only on for the big-model path.
GRAD_CKPT = ENABLE_4BIT

# The cache flag is the inverse of checkpointing, which avoids the
# "caching incompatible with checkpointing" warning spam.
USE_CACHE = not GRAD_CKPT

In [4]:
# Weights & Biases defaults. setdefault() only fills in a value when the variable
# is not already set, so anything exported before the kernel started wins.
os.environ.setdefault(
    "WANDB_PROJECT",
    "huggingface",  # or "grpo-demos"
)
# Last expression of the cell: its return value (the setting now in effect) is echoed.
os.environ.setdefault(
    "WANDB_LOG_MODEL",
    "end",
)

'end'

In [5]:
# Data
# Tiny toy dataset mixing math and coding prompts. Each row has a "prompt" column
# and a "task" column ("math" or "code"); the reward functions in this notebook
# take a `task` argument with those labels.
toy_examples = [
    ("What is 2+2?", "math"),
    ("Write a function that returns the sum of two numbers.", "code"),
    ("What is 3*4?", "math"),
    ("Write a function that returns the product of two numbers.", "code"),
]

dataset = Dataset.from_list(
    [
        {"prompt": prompt_text, "task": task_label}
        for prompt_text, task_label in toy_examples
    ]
)

In [6]:
# ---------------------------
# Reward funcs (neutral=0.0 for non-target tasks)
# ---------------------------
def math_reward_func(prompts, completions, task, **kwargs):
    """Score each sample 1.0 when its task label is "math", otherwise 0.0.

    Args:
        prompts: Unused; accepted to match the reward-function call signature.
        completions: Unused; the score does not look at the generated text.
        task: Iterable of task labels, one per sample.
        **kwargs: Ignored extra keyword arguments.

    Returns:
        A list of floats with one entry per element of ``task``.

    NOTE(review): the score depends only on the task label, so every completion
    sampled for the same prompt gets the same value. GRPO compares rewards within
    a prompt's group, so this presumably yields no learning signal — confirm that
    this is intended for the demo.
    """
    rewards = []
    for label in task:
        if label == "math":
            rewards.append(1.0)
        else:
            rewards.append(0.0)
    return rewards

def coding_reward_func(prompts, completions, task, **kwargs):
    """Score each sample 1.0 when its task label is "code", otherwise 0.0.

    Args:
        prompts: Unused; accepted to match the reward-function call signature.
        completions: Unused; the score does not look at the generated text.
        task: Iterable of task labels, one per sample.
        **kwargs: Ignored extra keyword arguments.

    Returns:
        A list of floats with one entry per element of ``task``.
    """
    rewards = []
    for label in task:
        if label == "code":
            rewards.append(1.0)
        else:
            rewards.append(0.0)
    return rewards

In [None]:
# ---------------------------
# Tokenizer
# ---------------------------
# Fast tokenizer for the selected checkpoint, padding on the left.
tokenizer_kwargs = dict(use_fast=True, padding_side="left")
tok = AutoTokenizer.from_pretrained(model_name, **tokenizer_kwargs)

# Fall back to the EOS token when the checkpoint defines no pad token.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

In [None]:
# ---------------------------
# Model (no quant for small; 4-bit kept for big)
# ---------------------------
# The quantization config is only built on the 4-bit path; otherwise None is passed through.
quant = (
    BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    if ENABLE_4BIT
    else None
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant,
    # 4-bit path uses "auto" placement; the unquantized path is pinned to the first GPU.
    device_map="auto" if ENABLE_4BIT else "cuda:0",
    torch_dtype=torch.float16,
    attn_implementation="sdpa",
    use_cache=USE_CACHE,
)

In [9]:
# ---------------------------
# LoRA (kept for both; safe with/without quant)
# ---------------------------
# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

peft_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

In [10]:
# ---------------------------
# GRPO config (W&B enabled; will save at epoch end)
# ---------------------------
# Batch / generation sizing: TRL rejects configs where the batch is not a multiple
# of num_generations. NOTE(review): which product is checked appears to depend on
# the TRL release (per-device batch x processes in older ones; additionally
# multiplied by gradient_accumulation_steps in newer ones). The previous values
# (batch 4, num_generations 8) only satisfy the second rule, so num_generations is
# set to the per-device batch size, which satisfies both on a single GPU.
args = GRPOConfig(
    output_dir="qwen2.5-0.5b-grpo",
    per_device_train_batch_size=4,  # keep divisible by num_generations
    gradient_accumulation_steps=2,
    num_generations=4,              # default is 8; lowered so it divides the per-device batch
    max_prompt_length=128,          # default 512
    max_completion_length=64,       # default 256
    fp16=True,                      # T4 uses fp16
    gradient_checkpointing=GRAD_CKPT,
    report_to="wandb",              # <-- keep W&B reporting
    run_name="qwen2.5-0.5b-noquant",  # change per run
    logging_steps=5,
    save_strategy="epoch",          # save at epoch end too
    save_total_limit=2,             # keep at most two checkpoints on disk
)

In [12]:
# Build the GRPO trainer from the model, the two task-specific reward functions,
# the toy dataset and the config defined earlier in the notebook.
trainer = GRPOTrainer(
    model=model,
    # Pass the tokenizer configured in this notebook (left padding, pad token set)
    # so the tokenizer used for training is the same object that is saved later.
    # Without it the trainer falls back to loading its own tokenizer.
    processing_class=tok,
    reward_funcs=[math_reward_func, coding_reward_func],
    train_dataset=dataset,
    args=args,
    peft_config=peft_cfg, # LoRA reduces trainable params & VRAM
)

In [None]:
# Run GRPO training with the trainer built above; being the last expression of the
# cell, the value returned by train() is displayed as the cell output.
trainer.train()

In [14]:
# Reuse the trainer's output directory as the destination for the final artifacts.
save_dir = args.output_dir

In [15]:
# ---------------------------
# Save: adapter/weights + tokenizer + trainer state
# ---------------------------
trainer.save_model(save_dir)        # saves PEFT adapter (and weights) appropriately
# Tokenizer files go into the same folder as the model artifacts.
tok.save_pretrained(save_dir)
# No path is passed here; presumably the state is written under args.output_dir
# (the same folder as save_dir) — confirm.
trainer.save_state()

In [16]:
# (Optional) Export a merged FP16 model (LoRA folded into the base weights).
# Only attempted when 4-bit loading is off. Best-effort: any failure is reported
# and the notebook continues.
if not ENABLE_4BIT:
    try:
        merged = trainer.model.merge_and_unload()
        merged_dir = os.path.join(save_dir, "merged-fp16")
        # Model first, then tokenizer, both into the same sub-folder.
        for saveable in (merged, tok):
            saveable.save_pretrained(merged_dir)
        print(f"Merged full model saved to: {merged_dir}")
    except Exception as err:
        print("Merge skipped (not a PEFT model or unsupported):", err)

Merged full model saved to: qwen2.5-0.5b-grpo/merged-fp16


In [18]:
import shutil

# Compress the saved run folder. The archive is derived from save_dir rather than
# a hard-coded folder name, so it stays in sync with GRPOConfig.output_dir.
# normpath drops any trailing slash so the zip lands next to the folder, not in it.
archive_base = os.path.normpath(save_dir)
archive_path = shutil.make_archive(archive_base, "zip", root_dir=save_dir)

# Download to your computer when running in Colab. The Colab helper is imported
# here because it only exists on Colab; elsewhere just report the archive path.
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Colab; archive available at: {archive_path}")
else:
    files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>