In [1]:
import torch
from unsloth import FastLanguageModel
import os
from datasets import load_dataset, Dataset, IterableDataset
from pprint import pprint
import re
import wandb
from vllm import SamplingParams
from datetime import datetime

# Check GPU
if torch.cuda.is_available():
    print(f"GPU detected: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU detected.")

# Environment variables for torch
os.environ["TORCH_LOGS"] = "recompiles"
os.environ['TORCHDYNAMO_CACHE_SIZE_LIMIT'] = '999999999'
import torch._dynamo 
torch._dynamo.config.cache_size_limit = 64

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-20 15:49:08 [__init__.py:244] Automatically detected platform cuda.
GPU detected: NVIDIA GeForce RTX 3060


In [2]:

reasoning_start = "<start_working_out>"
reasoning_end = "<end_working_out>"
solution_start = "<solution>"
solution_end = "</solution>"

system_prompt = \
    f"""You are given a problem, think about the problem and provide your workout. 
    Place it between {reasoning_start} and {reasoning_end}. Then provide your solution
    between {solution_start}{solution_end}"""

print(system_prompt)


You are given a problem, think about the problem and provide your workout. 
    Place it between <start_working_out> and <end_working_out>. Then provide your solution
    between <solution></solution>


# Prepare dataset

In [None]:
# This regular expression is used to match a specific format in a string, typically for extracting
# the solution part from a text that contains both reasoning and solution sections.
# - It expects the string to start with optional whitespace.
# - Then it looks for the reasoning section, which starts with the value of `reasoning_start`,
#   contains any characters (non-greedy), and ends with `reasoning_end`.
# - After that, it expects the solution section, which starts with `solution_start`,
#   captures everything up to `solution_end` (the solution itself is captured in a group).
# - Finally, it expects optional whitespace at the end of the string.
# - The flags `re.MULTILINE` and `re.DOTALL` allow the regex to match across multiple lines
#   and let the dot (`.`) match newline characters as well.

match_format = re.compile(
    rf"^[\s]{{0,}}"\
    rf"{reasoning_start}.+?{reasoning_end}.*?"\
    rf"{solution_start}(.+?){solution_end}"\
    rf"[\s]{{0,}}$",
    flags = re.MULTILINE | re.DOTALL
)

#test
res = match_format.search(
    "<start_working_out>Let me think!<end_working_out>"\
    "<solution>2</solution>",
)

print(res)

<re.Match object; span=(0, 71), match='<start_working_out>Let me think!<end_working_out>>


# Rewards function

In [None]:
def match_format_exactly(completions, **kwargs):
    reward = kwargs.get("reward", 3.0)
    scores = []
    for completion in completions:
        score = 0
        

In [5]:
MODEL_ID = "unsloth/gemma-3-1b-it"
MAX_SEQ_LENGTH = 1024
LORA_RANK = 32
GPU_UTIL=0.6

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_ID,
    max_seq_length = MAX_SEQ_LENGTH,
    load_in_4bit=True,
    fast_inference=True, #enabled vllm
    max_lora_rank=LORA_RANK,
    gpu_memory_utilization=GPU_UTIL
)

peft_model = FastLanguageModel.get_peft_model(
    model,
    finetune_vision_layers = False,
    finetune_language_layers = True,
    finetune_attention_modules = True,
    finetune_mlp_modules = True,
    r=LORA_RANK,
    lora_alpha=LORA_RANK,
    use_gradient_checkpointing='unsloth', #enable long FT
    bias='none',
    random_state = 42
)

print(type(peft_model))


==((====))==  Unsloth 2025.7.5: Fast Gemma3 patching. Transformers: 4.53.2. vLLM: 0.9.2.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.622 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Making `model.base_model.model.model` require gradients
<class 'peft.peft_model.PeftModelForCausalLM'>


In [None]:
VERSION = datetime.now().strftime("%Y%m%d_%H%M%S")
print("current_verion = ", VERSION)
# model.save_lora('grpo_lora_model')
# model.save_pretrained_merged(f"outputs/grpo_saved_{VERSION}", tokenizer, save_method = "merged_16bit")

text = tokenizer.apply_chat_template(
    [
        {'role': 'system', "content": SYSTEM_PROMPT},
        {'role': 'user', "content": 'Calculate pi'},
    ],
    add_generation_prompt=True,
    return_tensors = "pt"
).to('cuda')

print(text)

from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True)

res = peft_model.generate(
    text,
    streamer = streamer,
    max_new_tokens=128,
    pad_token_id = tokenizer.eos_token_id
)


current_verion =  20250720_101140
tensor([[     2,    105,   2364,    108, 118904,    528,    506,   2269,   6518,
         236787,    107, 236820,  27388,    522, 236813,    107,   1390,    107,
            954,  27388,    522, 236813,    107, 236820,  14433, 236813,    107,
           1390,    107,    954,  14433, 236813,    109,  50702,   4604,    106,
            107,    105,   4368,    107]], device='cuda:0')
<reasoning>
Pi (π) is a mathematical constant that represents the ratio of a circle’s circumference to its diameter. It’s approximately 3.14159.  It’s an irrational number, meaning its decimal representation goes on forever without repeating.  The value of Pi is defined as the ratio of a circle’s circumference to its diameter.

</reasoning>
<answer>
3.14159...
</answer><end_of_turn>


In [19]:
# Detokenize the tensor and print the generated text
generated_text = tokenizer.decode(res[0], skip_special_tokens=True)
print(generated_text)

user

Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>


Calculate pi
model
<reasoning>
Pi (π) is a mathematical constant that represents the ratio of a circle’s circumference to its diameter. It’s approximately 3.14159.  It’s an irrational number, meaning its decimal representation goes on forever without repeating.  The value of Pi is defined as the ratio of a circle’s circumference to its diameter.

</reasoning>
<answer>
3.14159...
</answer>


# Evaluation after GRPO

In [None]:

text = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": "Calculate pi."},
    ],
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt"
).to('cuda')

print(text)

sampling_params = SamplingParams(
    temperature=0.8,
    top_p = 0.95,
    max_tokens = 1024
)

output = model.generate(
    text, 
)


tensor([[     2,    105,   2364,    108, 118904,    528,    506,   2269,   6518,
         236787,    107, 236820,  27388,    522, 236813,    107,   1390,    107,
            954,  27388,    522, 236813,    107, 236820,  14433, 236813,    107,
           1390,    107,    954,  14433, 236813,    109,  50702,   4604, 236761,
            106,    107,    105,   4368,    107]], device='cuda:0')


ValueError: The following `model_kwargs` are not used by the model: ['sampling_params'] (note: typos in the generate arguments will also show up in this list)

AssertionError: 