In [None]:
import torch
from unsloth import FastLanguageModel
import os
from datasets import load_dataset, Dataset, IterableDataset
from pprint import pprint
import re
import wandb
from vllm import SamplingParams
from datetime import datetime

# Report which CUDA device (if any) this kernel can see.
gpu_status = (
    f"GPU detected: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "No GPU detected."
)
print(gpu_status)

# TorchDynamo diagnostics and recompile budget.
# NOTE(review): these variables are written after `import torch` has already
# executed in this kernel -- confirm that torch still picks them up here.
os.environ.update(
    {
        "TORCH_LOGS": "recompiles",
        "TORCHDYNAMO_CACHE_SIZE_LIMIT": "999999999",
    }
)
import torch._dynamo
# NOTE(review): this assigns 64 on the config object while the environment
# variable above asks for 999999999 -- confirm which limit is intended.
torch._dynamo.config.cache_size_limit = 64

In [None]:

# System message sent with every chat: it asks the model to wrap its output in
# a <reasoning> section followed by an <answer> section. The value begins and
# ends with a newline.
SYSTEM_PROMPT = (
    "\n"
    "Respond in the following format:\n"
    "<reasoning>\n"
    "...\n"
    "</reasoning>\n"
    "<answer>\n"
    "...\n"
    "</answer>\n"
)

# Template with the same tag layout as SYSTEM_PROMPT; fill it with
# XML_COT_FORMAT.format(reasoning=..., answer=...).
XML_COT_FORMAT = (
    "\n"
    "<reasoning>\n"
    "{reasoning}\n"
    "</reasoning>\n"
    "<answer>\n"
    "{answer}\n"
    "</answer>\n"
)

In [None]:
# Checkpoint and fine-tuning hyperparameters used by the rest of the notebook.
MODEL_ID = "google/gemma-3-1b-it"
MAX_SEQ_LENGTH = 1024
LORA_RANK = 32
GPU_UTIL = 0.6

# Load the base model and tokenizer in 4-bit.
base_load_options = dict(
    model_name=MODEL_ID,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=True,
    fast_inference=True,  # enables vLLM (per the original author's note)
    max_lora_rank=LORA_RANK,
    gpu_memory_utilization=GPU_UTIL,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_load_options)

# Attach LoRA adapters: language, attention and MLP parts are marked for
# fine-tuning, vision layers are not. lora_alpha is set equal to the rank.
lora_options = dict(
    finetune_vision_layers=False,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    r=LORA_RANK,
    lora_alpha=LORA_RANK,
    use_gradient_checkpointing="unsloth",  # original note: "enable long FT"
    random_state=42,
)
peft_model = FastLanguageModel.get_peft_model(model, **lora_options)

# Show which wrapper class get_peft_model returned.
print(type(peft_model))


In [None]:
# Timestamp tag (YYYYMMDD_HHMMSS) for this run; the commented-out save call
# below uses it in its output path.
VERSION = datetime.now().strftime("%Y%m%d_%H%M%S")
print("current_verion = ", VERSION)
# model.save_lora('grpo_lora_model')
# model.save_pretrained_merged(f"outputs/grpo_saved_{VERSION}", tokenizer, save_method = "merged_16bit")

# Chat for a quick generation check: format instructions plus one question.
baseline_messages = [
    {'role': 'system', "content": SYSTEM_PROMPT},
    {'role': 'user', "content": 'Calculate pi'},
]

# Despite its name, `text` holds the templated prompt as returned with
# return_tensors="pt", moved to the GPU -- not a Python string.
text = tokenizer.apply_chat_template(
    baseline_messages,
    add_generation_prompt=True,
    return_tensors="pt",
)
text = text.to('cuda')

print(text)

from transformers import TextStreamer

# Stream the reply as it is generated, leaving the prompt out of the stream.
streamer = TextStreamer(tokenizer, skip_prompt=True)

# Generate at most 128 new tokens, using the EOS token id for padding.
res = peft_model.generate(
    text,
    streamer=streamer,
    max_new_tokens=128,
    pad_token_id=tokenizer.eos_token_id,
)


In [None]:
# Convert the first sequence returned by generate back into a string, dropping
# special tokens, and print it.
# NOTE(review): res[0] presumably still contains the prompt tokens -- confirm.
first_sequence = res[0]
generated_text = tokenizer.decode(first_sequence, skip_special_tokens=True)
print(generated_text)

# Evaluation after GRPO

In [None]:

# Build the evaluation prompt (format instructions + one question) and
# tokenize it to a tensor on the GPU. Despite its name, `text` holds the
# tokenized prompt, not a Python string.
text = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": "Calculate pi."},
    ],
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt"
).to('cuda')

print(text)

# Sampling settings for the evaluation run.
sampling_params = SamplingParams(
    temperature=0.8,
    top_p = 0.95,
    max_tokens = 1024
)

# Previously `sampling_params` was built but never handed to generate(), so
# the call ran with default generation settings. This call takes the tokenized
# prompt like the earlier generate() call in this notebook, so the values are
# forwarded as generate() keywords rather than as a vLLM SamplingParams object.
output = model.generate(
    text,
    do_sample=True,  # temperature / top_p only take effect when sampling
    temperature=sampling_params.temperature,
    top_p=sampling_params.top_p,
    max_new_tokens=sampling_params.max_tokens,
    pad_token_id=tokenizer.eos_token_id,  # same padding choice as the earlier generate call
)
