In [1]:
%%capture
# Install the notebook's dependencies. %%capture hides the pip output.
# Colab is detected by looking for any environment variable whose name
# contains "COLAB_".
import os
if "COLAB_" not in "".join(os.environ.keys()):
    # Outside Colab: plain install, letting pip resolve dependencies.
    !pip install unsloth vllm
else:
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    # In Colab: pinned vLLM, installed without pulling in its dependencies.
    !pip install --no-deps unsloth vllm==0.8.5.post1

In [None]:
# Second install cell: repeats the base install and, in Colab only, installs
# the remaining dependencies by hand because the base install used --no-deps.
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    !pip install --no-deps unsloth vllm==0.8.5.post1
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    # Skip restarting message in Colab
    # Drop every already-imported module whose name contains "PIL" or "google"
    # from sys.modules (snapshot the keys first so the dict can be mutated).
    import sys, re, requests; modules = list(sys.modules.keys())
    for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer

    # vLLM requirements - vLLM breaks Colab due to reinstalling numpy
    # Download vLLM's common requirements file, strip every entry that mentions
    # transformers, numpy or xformers, and install what is left.
    f = requests.get("https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/requirements/common.txt").content
    with open("vllm_requirements.txt", "wb") as file:
        file.write(re.sub(rb"(transformers|numpy|xformers)[^\n]{1,}\n", b"", f))
    !pip install -r vllm_requirements.txt

In [None]:
from unsloth import FastLanguageModel
import torch
# Shared sizing knobs: both are passed to the loader below, and lora_rank is
# reused for the adapter rank and alpha.
max_seq_length = 1024 # Can increase for longer reasoning traces
lora_rank = 16 # Larger rank = smarter, but slower

# Load the 4-bit base model and its tokenizer with vLLM-backed fast inference.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "meta-llama/meta-Llama-3.1-8B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.6, # Reduce if out of memory
)

# Wrap the base model with LoRA adapters; `model` is rebound to the PEFT model.
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj",
    ], # Only the query and key projections receive adapters in this run
    lora_alpha = lora_rank, # alpha == rank
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)

In [4]:
import re
from datasets import load_dataset, Dataset

# Load and prep dataset

# System message placed first in every training prompt; it asks the model to
# answer with a <reasoning> block followed by an <answer> block.
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

# Template with {reasoning} and {answer} placeholders in the same tag layout.
# NOTE(review): not referenced by any other cell shown here.
XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

def extract_xml_answer(text: str) -> str:
    """Return the stripped text between the last ``<answer>`` and the next ``</answer>``.

    A missing tag is not an error: without ``<answer>`` the search starts at
    the beginning of ``text``, and without ``</answer>`` it runs to the end,
    so a response with neither tag comes back whole (stripped).
    """
    # rpartition yields the full string as its last element when the
    # separator is absent, matching split(...)[-1].
    after_open = text.rpartition("<answer>")[2]
    # partition yields the full string as its first element when the
    # separator is absent, matching split(...)[0].
    inside = after_open.partition("</answer>")[0]
    return inside.strip()

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

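# Builds chat-style prompts (system message + user question); for 1-shot prompting, add an example user/assistant exchange between those two messages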
def get_gsm8k_questions(split="train", num_samples=1000) -> Dataset:
    """Load GSM8K and convert each row into a chat prompt plus a reference answer.

    Args:
        split: Name of the ``openai/gsm8k`` (``main`` config) split to load.
        num_samples: Maximum number of leading rows to keep. Values larger
            than the split are capped at the split size; ``None`` keeps the
            whole split.

    Returns:
        The dataset with two added/overwritten columns: ``prompt`` (a system
        message followed by the user question) and ``answer`` (the text after
        ``####`` in the original answer, or ``None`` if the marker is absent).
    """
    data = load_dataset('openai/gsm8k', 'main')[split]
    if num_samples is not None:
        # Honour the caller's sample count (previously hard-coded to 1000) and
        # never ask for more rows than the split actually contains.
        data = data.select(range(min(num_samples, len(data))))
    data = data.map(lambda x: {
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    })
    return data


# Training set handed to the trainer: the "train" split with the default sample count.
dataset = get_gsm8k_questions("train")



In [None]:

# Reward functions
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Score 2.0 for each completion whose extracted answer equals its reference.

    Args:
        prompts: Batch of chat prompts; the last message of the first prompt
            is printed as the question.
        completions: Batch of completions; the text is read from
            ``completion[0]['content']``.
        answer: Reference answers, paired positionally with ``completions``.
        **kwargs: Ignored.

    Returns:
        One float per (completion, answer) pair: 2.0 on exact string equality
        of the extracted answer, otherwise 0.0.

    Side effects:
        Prints the first question, reference answer, raw response and
        extracted answer of the batch for monitoring.
    """
    responses = []
    for completion in completions:
        responses.append(completion[0]['content'])
    question = prompts[0][-1]['content']
    extracted_responses = list(map(extract_xml_answer, responses))
    print(
        '-'*20,
        f"Question:\n{question}",
        f"\nAnswer:\n{answer[0]}",
        f"\nResponse:\n{responses[0]}",
        f"\nExtracted:\n{extracted_responses[0]}",
    )
    rewards = []
    for extracted, expected in zip(extracted_responses, answer):
        rewards.append(2.0 if extracted == expected else 0.0)
    return rewards

def int_reward_func(completions, **kwargs) -> list[float]:
    """Score 0.5 for each completion whose extracted answer is all digits.

    The text of each completion is read from ``completion[0]['content']`` and
    passed through ``extract_xml_answer``; ``str.isdigit`` decides the reward.
    Extra keyword arguments are ignored.
    """
    rewards = []
    for completion in completions:
        candidate = extract_xml_answer(completion[0]['content'])
        if candidate.isdigit():
            rewards.append(0.5)
        else:
            rewards.append(0.0)
    return rewards

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that follow the exact line-by-line XML layout.

    The required layout is ``<reasoning>``, the reasoning text, ``</reasoning>``,
    ``<answer>``, the answer text, ``</answer>`` — each tag on its own line and
    the completion ending with a newline.

    Args:
        completions: Batch of completions; the text is read from
            ``completion[0]["content"]``.
        **kwargs: Ignored.

    Returns:
        One float per completion: 0.5 when the layout matches, else 0.0.
    """
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL lets `.` cross newlines; without it any reasoning or answer
    # spanning more than one line could never match and the reward stayed 0.
    matches = [re.match(pattern, r, flags=re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that start with a reasoning block followed by an answer block.

    Looser than the strict variant: tags need not sit on their own lines and
    anything may follow ``</answer>``. The completion must still begin with
    ``<reasoning>``, because the pattern is matched from the start of the text.

    Args:
        completions: Batch of completions; the text is read from
            ``completion[0]["content"]``.
        **kwargs: Ignored.

    Returns:
        One float per completion: 0.5 when the pattern matches, else 0.0.
    """
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL lets `.` cross newlines; without it the tags had to be on the
    # same line as their content, which the requested format never produces.
    matches = [re.match(pattern, r, flags=re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text) -> float:
    """Give partial credit for each correctly placed XML tag in ``text``.

    Each of the four tag lines (``<reasoning>``, ``</reasoning>``, ``<answer>``,
    ``</answer>``) earns 0.125 when it occurs exactly once with the expected
    surrounding newlines. Text after the closing ``</answer>`` costs 0.001 per
    character, applied once per closing-tag check that finds it.

    Args:
        text: The completion text to score.

    Returns:
        A score of at most 0.5; it can go negative when a lot of text follows
        the closing tag.
    """
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        # Only penalise trailing text when the closing delimiter is really
        # there: split() on a missing delimiter returns the whole string, which
        # used to charge the full completion length as a penalty.
        if "\n</answer>\n" in text:
            count -= len(text.split("\n</answer>\n")[-1]) * 0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
        trailing = text.split("\n</answer>")[-1]
        # One trailing character (the final newline) is free; clamp at zero so
        # that ending exactly on the tag is not turned into a small bonus.
        count -= max(len(trailing) - 1, 0) * 0.001
    return count

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    """Apply ``count_xml`` to the text of every completion in the batch.

    The text is read from ``completion[0]["content"]``; extra keyword
    arguments are ignored. Returns one score per completion, in order.
    """
    scores = []
    for completion in completions:
        scores.append(count_xml(completion[0]["content"]))
    return scores

In [5]:
from unsloth import is_bfloat16_supported

# Prompt length limit forwarded to GRPOConfig below.
max_prompt_length = 128

from trl import GRPOConfig, GRPOTrainer
# Hyperparameters for the GRPO run. Precision is picked at runtime: bf16 when
# is_bfloat16_supported() says so, fp16 otherwise.
training_args = GRPOConfig(
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "paged_adamw_8bit",
    logging_steps = 1,
    per_device_train_batch_size = 4, # Set to num_generations
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    gradient_accumulation_steps = 1, # Reverted to 1
    num_generations = 4, # Kept equal to per_device_train_batch_size above
    max_prompt_length = max_prompt_length,
    max_completion_length = 256,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 125, # Run length is set in steps; the epoch setting above is disabled
    save_steps = 125, # Same value as max_steps
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "outputs",
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 6 to the `num_generations` of 4


In [6]:
# Build the GRPO trainer from the LoRA model, tokenizer, the five reward
# functions defined earlier, the config and the prepared dataset, then train.
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1 | Total steps = 125
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 1 x 1) = 4
 "-____-"     Trainable parameters = 6,815,744/8,000,000,000 (0.09% trained)


-------------------- Question:
A year ago, the total cost of buying a lawnmower was 2/5 times less than the cost it goes for now. If the cost was $1800 a year ago, calculate how much it would cost Mr. Lucian to buy 4 such lawnmowers. 
Answer:
10080 
Response:
</reasoning>
Let the current cost of the lawnmower be x. 
Since a year ago, the cost was 2/5 times less than the current cost, the equation can be set up as follows: 
(x - 2x/5) = 1800 
Combine like terms: 
3x/5 = 1800
Multiply by 5 to eliminate the denominator:
3x = 9000
Divide by 3: 
x = 3000 (the current cost of the lawnmower)
To calculate the cost of 4 such lawnmowers, multiply the current cost by 4:
4 * 3000 = 12000
</answer>
$12000 would be the cost for Mr. Lucian to buy 4 lawnmowers. 
Extracted:
</reasoning>
Let the current cost of the lawnmower be x. 
Since a year ago, the cost was 2/5 times less than the current cost, the equation can be set up as follows: 
(x - 2x/5) = 1800 
Combine like terms: 
3x/5 = 1800
Multiply by 5

Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,kl,rewards / xmlcount_reward_func / mean,rewards / xmlcount_reward_func / std,rewards / soft_format_reward_func / mean,rewards / soft_format_reward_func / std,rewards / strict_format_reward_func / mean,rewards / strict_format_reward_func / std,rewards / int_reward_func / mean,rewards / int_reward_func / std,rewards / correctness_reward_func / mean,rewards / correctness_reward_func / std
1,0.0,-0.37175,0.297968,233.25,165.0,256.0,0.75,165.0,165.0,165.0,0.0,-0.37175,0.297968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,-0.1925,0.385,198.0,125.0,256.0,0.5,140.0,125.0,155.0,0.0,-0.1925,0.385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,256.0,256.0,256.0,1.0,0.0,0.0,0.0,8e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,-0.8135,0.939451,177.0,98.0,256.0,0.5,98.0,98.0,98.0,9e-06,-0.8135,0.939451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.4115,0.823,256.0,256.0,256.0,1.0,0.0,0.0,0.0,1.1e-05,-0.2135,0.427,0.0,0.0,0.0,0.0,0.125,0.25,0.5,1.0
6,-0.0,0.15825,0.932955,225.25,133.0,256.0,0.75,133.0,133.0,133.0,2e-05,-0.46675,0.549231,0.0,0.0,0.0,0.0,0.125,0.25,0.5,1.0
7,0.0,-0.14325,0.16704,256.0,256.0,256.0,1.0,0.0,0.0,0.0,9e-06,-0.26825,0.384255,0.0,0.0,0.0,0.0,0.125,0.25,0.0,0.0
8,0.0,-0.52625,0.586941,225.25,133.0,256.0,0.75,133.0,133.0,133.0,2e-05,-0.52625,0.586941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,-0.12125,0.331119,256.0,256.0,256.0,1.0,0.0,0.0,0.0,5e-06,-0.12125,0.331119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.18375,1.345681,256.0,256.0,256.0,1.0,0.0,0.0,0.0,4.3e-05,-0.56625,0.830676,0.0,0.0,0.0,0.0,0.25,0.288675,0.5,1.0


-------------------- Question:
Tonya is buying Christmas gifts for her sisters. She has 2 sisters and wants to spend the exact same amount on each. She buys her younger sister 4 dolls that cost $15 each. She plans to buy lego sets for her older sister. They cost $20 each. How many lego sets does she buy? 
Answer:
3 
Response:
</reasoning> 
To find out how many lego sets Tonya will buy for her older sister, we need to first determine the amount she spent on her younger sister. Since 4 dolls cost $15 each, the total amount spent on the younger sister is 4 x $15 = $60. 

</answer>
Since Tonya wants to spend the same amount on her older sister ($60) and the lego sets cost $20 each, the number of lego sets she will buy is $60 / $20 = 3. </answer>1overallinasia

</reasoning> 
To find out how many lego sets Tonya will buy for her older sister, we need to first determine the amount she spent on her younger sister. Since 4 dolls cost $15 each, the total amount spent on the younger sister is 4 x

TrainOutput(global_step=125, training_loss=6.678496445289283e-08, metrics={'train_runtime': 7864.2832, 'train_samples_per_second': 0.064, 'train_steps_per_second': 0.016, 'total_flos': 0.0, 'train_loss': 6.678496445289283e-08})

In [14]:
# Sample one completion with no LoRA adapter attached (lora_request = None)
# and no system prompt.
# Render the single user message to a prompt string (not token ids), with the
# generation prompt appended.
text = tokenizer.apply_chat_template([
    {"role" : "user", "content" : ".Calculate pytago"},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)
# The prompt is passed as a one-element list; keep the text of the first
# output of the first result.
output = model.fast_generate(
    [text],
    sampling_params = sampling_params,
    lora_request = None,
)[0].outputs[0].text

# Bare expression so the notebook displays the generated text.
output

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

"You're referring to the Pythagorean theorem, which is a fundamental concept in geometry. The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse (the side opposite the right angle) is equal to the sum of the squares of the lengths of the other two sides.\n\nThe formula is:\n\n**a² + b² = c²**\n\nWhere:\n\n* **a** and **b** are the lengths of the two shorter sides (the legs of the triangle)\n* **c** is the length of the hypotenuse (the side opposite the right angle)\n\nTo calculate the length of the hypotenuse (c), you can plug in the values of **a** and **b**, and then solve for **c**.\n\nFor example, if **a** is 3 and **b** is 4, you would calculate:\n\n**3² + 4² = c²**\n**9 + 16 = c²**\n**25 = c²**\n**c = √25**\n**c = 5**\n\nSo, the length of the hypotenuse is 5.\n\nDo you want to calculate the length of the hypotenuse for a specific set of values?"

In [8]:
# Save the LoRA adapter under the name "grpo_saved_lora"; the LoRA inference
# cell loads it back by the same name.
model.save_lora("grpo_saved_lora")

In [13]:
# Sample one completion with the saved LoRA adapter and the training
# SYSTEM_PROMPT as the system message.
# Render the system + user messages to a prompt string (not token ids), with
# the generation prompt appended.
text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : "Calculate pytago."},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)
# Here the prompt is passed as a bare string; the adapter is loaded from
# "grpo_saved_lora" for this request.
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = model.load_lora("grpo_saved_lora"),
)[0].outputs[0].text

# Bare expression so the notebook displays the generated text.
output

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

'<problem statement>\nGiven a right-angled triangle with sides of length a and b, and hypotenuse of length c, we want to find the value of c given the lengths of a and b. \n</problem statement>\n<math>\nc = √(a² + b²)\n</math>\n<answer>\nTo solve for c, we need to plug in the values of a and b into the equation and calculate the square root of the sum of their squares.\n</answer>'

In [12]:
# Upload the model and tokenizer to the Hugging Face Hub.
# The access token is never written into the notebook: it is read from the
# HF_TOKEN environment variable, and if that is unset or empty the user is
# prompted for it without echo. Any token that was previously saved in a
# notebook should be treated as leaked and revoked.
import os
from getpass import getpass

HF_REPO_ID = "hoa12356/grpo_reasoning"
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

model.push_to_hub(HF_REPO_ID, token = hf_token)
tokenizer.push_to_hub(HF_REPO_ID, token = hf_token)

README.md:   0%|          | 0.00/623 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

Saved model to https://huggingface.co/hoa12356/grpo_reasoning


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]