In [None]:
!pip install accelerate datasets transformers trl torch peft wandb jinja2==3.1.0 vllm antlr4-python3-runtime sympy python-Levenshtein math-verify[antlr4_13_2]

In [1]:
import torch
import transformers
import trl
import jinja2
import vllm
import peft

print(f"PyTorch version: {torch.__version__}")
print(f"Transformers backend: {transformers.file_utils.is_torch_available()}")
print(f"{jinja2.__version__ }")
print(f"{vllm.__version__ }")
print(f"{peft.__version__}")

  from .autonotebook import tqdm as notebook_tqdm


INFO 02-23 13:35:57 __init__.py:183] Automatically detected platform cuda.


2025-02-23 13:35:57,581	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


PyTorch version: 2.5.1+cu124
Transformers backend: True
3.1.4
0.7.0
0.14.0


In [1]:
"""
Reference:

https://gist.github.com/willccbb/4676755236bb08cab5f4e54a0475d6fb
"""
import re
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
# load and prepare ds
SYSTEM_PROMPT = """
Responde in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""
SYSTEM_PROMPT

'\nResponde in the following format:\n<reasoning>\n...\n</reasoning>\n<answer>\n...\n</answer>\n'

In [3]:
def extract_xml_answer(text:str):
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()

def extract_hash_answer(text:str):
    if '####' not in text:
        return None
    return text.split('####')[1].strip()

def get_gsm8k_questions(split='train'):
    data = load_dataset('openai/gsm8k', 'main')[split]
    data = data.map(lambda x: {
        'prompt':[
            {'role':'system', 'content': SYSTEM_PROMPT},
            {'role':'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    }) # type: ignore
    return data

def get_math200k_questions(split='train'):
    data = load_dataset('open-r1/OpenR1-Math-220k', 'default')[split]
    data = data.map(lambda x: {
        'prompt':[
            {'role':'system', 'content': SYSTEM_PROMPT},
            {'role':'user', 'content': x['problem']}
        ],
        'answer': x['answer']
    }) # type: ignore
    return data.remove_columns(["messages", "correctness_math_verify", "is_reasoning_complete", "generations", "correctness_llama", "correctness_count"])
    
print(extract_xml_answer("<answer>123</answer>"))
print(extract_hash_answer('12 #### asas'))

123
asas


In [4]:
dataset = get_gsm8k_questions()
dataset

Dataset({
    features: ['question', 'answer', 'prompt'],
    num_rows: 7473
})

In [5]:
# from pylatexenc.latexwalker import LatexWalker, LatexWalkerParseError

# def is_valid_latex(latex_str):
#     try:
#         # Create a LatexWalker instance with your input string
#         walker = LatexWalker(latex_str)
#         # Try parsing the entire string into nodes
#         nodelist, pos, len_ = walker.get_latex_nodes()
#         return True  # Parsed without throwing an error
#     except LatexWalkerParseError as e:
#         print(f"Parse error: {e}")
#         return False

# # Example usage
# sample = r"}sadka;sd}"
# print(is_valid_latex(sample))  # Should print True

In [6]:
# import Levenshtein

# str1 = "a/b"
# str2 = "a/b"

# distance = Levenshtein.distance(str1, str2)
# similarity_ratio = Levenshtein.ratio(str1, str2)

# print(f"Levenshtein Distance: {distance}")
# print(f"Similarity Ratio: {similarity_ratio}")


In [7]:
# step = 0
# reward functions
import Levenshtein
# from math_verify import parse, verify

def correctness_reward_func(prompts, completions, answer, **kwargs):
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content'] # [{role:system},{role:user},{role:assistance}]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    if '</answer>' in responses[0] and '<answer>' in responses[0] and '<reasoning>' in responses[0] and '</reasoning>' in responses[0]:
        # print('-'*20, f"Question:\n{q}\n", '-'*20, f"Answer:\n{answer[0]}\n", '-'*20, f"Response:\n{responses[0]}\n", '-'*20, f"Extracted:\n{extracted_responses[0]}",'\n\n')
        print('-'*20, f"Question:\n{q}\n", f"Answer: {answer[0]}\n", f"Extracted: {extracted_responses[0]}")
    
    return [3.0 if r.strip()==a.strip() else 0.0 for r,a in zip(extracted_responses, answer)]

def similarity_reward_func(completions, answer, **kwargs):
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    return [Levenshtein.ratio(r, a) for r,a in zip(extracted_responses, answer)]
    
def int_reward_func(completions, **kwargs):
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    return [1.0 if r.isdigit() else 0.0 for r in extracted_responses]

def strict_format_reward_func(completions, **kwargs):
    """
    Reward function that checks if the completion has as specific format
    """
    pattern = r"^<reasoning>.*</reasoning>\n<answer>.*?</answer>$"
    responses = [completion[0]['content'] for completion in completions]
    matches = [re.match(pattern, r, re.DOTALL) for r in responses]
    return [1.0 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs):
    """
    Reward function that checks if the completion has a specific format
    """
    pattern = r"<reasoning>.*</reasoning>\n<answer>.*?</answer>"
    responses = [completion[0]['content'] for completion in completions]
    matches = [re.match(pattern, r, re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text):
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count+=0.125
    if text.count("\n</reasoning>\n") == 1:
        count+=0.125
    if text.count("\n<answer>\n") == 1:
        count+=0.125
        count -= len(text.split("\n</answer>\n")[-1]) * 0.001
    if text.count("\n</answer>") == 1:
        count+=0.125
        count-=(len(text.split("\n</answer>")[-1])-1)*0.001
    return count

def xml_count_reward_func(completions, **kwargs):
    contents = [completion[0]['content'] for completion in completions]
    return [count_xml(c) for c in contents]



prompts = [
    [{"role": "assistant", "content": "<reasoning>\nThe sum of 1 and 2 is 3, which we multiply by 4 to get 12.\n</reasoning>\n<answer>\n(1 + 2) * 4 = 12\n</answer>"}],
]
completions = [
    [{"role": "assistant", "content": "<reasoning>\nThe sum of 1 and 2 is 3, which we multiply by 4 to get 12.\n</reasoning>\n<answer>\n(1 + 2) * 4 = 12\n</answer>"}],
    # [{"role": "assistant", "content": "The sum of 3 and 1 is 4, which we multiply by 2 to get 8. So (3 + 1) * 2 = 8."}],
]

print(xml_count_reward_func(completions))
# 0.382
print(correctness_reward_func(prompts, completions, ['(1 + 2) * 4 = 12']))


[0.382]
-------------------- Question:
<reasoning>
The sum of 1 and 2 is 3, which we multiply by 4 to get 12.
</reasoning>
<answer>
(1 + 2) * 4 = 12
</answer>
 Answer: (1 + 2) * 4 = 12
 Extracted: (1 + 2) * 4 = 12
[3.0]


In [8]:
# <reasoning>
# First, let's count the number of people in each section:

# - Orchestra section: 1 (Sebastian, the drummer)
# - Brass section: 7 people (4 trombones, 2 trumpets, 1 French horn)
# - Strings section: 5 people (3 violins, 1 cellist, 1 contrabassist)
# - Woodwinds section: 3 clarinets + 4 flutes = 7 people

# Now, let's add up the total number of people:

# Orchestra section: 1
# Brass section: 7
# Strings section: 5
# Woodwinds section: 7

# Total number of people in the orchestra: 1 + 7 + 5 + 7 = 20
# </reasoning>

# <answer>
# 20
# </answer>

text = '''<reasoning>
First, let's count the number of people in each section:

- Orchestra section: 1 (Sebastian, the drummer)
- Brass section: 7 people (4 trombones, 2 trumpets, 1 French horn)
- Strings section: 5 people (3 violins, 1 cellist, 1 contrabassist)
- Woodwinds section: 3 clarinets + 4 flutes = 7 people

Now, let's add up the total number of people:

Orchestra section: 1
Brass section: 7
Strings section: 5
Woodwinds section: 7

Total number of people in the orchestra: 1 + 7 + 5 + 7 = 20
</reasoning>
<answer>
20
</answer>'''


# def strict_format_reward_func(completions, **kwargs) -> list[float]:
#     """Reward function that checks if the completion has a specific format."""
#     # pattern = r"^<reasoning>.*?</reasoning><answer>\n.*?\n</answer>\n$"
#     pattern = r"^<reasoning>.*</reasoning>\n<answer>.*?</answer>$"
#     responses = [completion[0]["content"] for completion in completions]
#     matches = [re.match(pattern, r, re.DOTALL) for r in responses]
#     return [0.5 if match else 0.0 for match in matches]
    
completions = [
    [{"role": "assistant", "content": text}],
    # [{"role": "assistant", "content": "The sum of 3 and 1 is 4, which we multiply by 2 to get 8. So (3 + 1) * 2 = 8."}],
]

strict_format_reward_func(completions)

[1.0]

In [9]:
import re

text = '''<reasoning>
First, let's count the number of people in each section:

- Orchestra section: 1 (Sebastian, the drummer)
- Brass section: 7 people (4 trombones, 2 trumpets, 1 French horn)
- Strings section: 5 people (3 violins, 1 cellist, 1 contrabassist)
- Woodwinds section: 3 clarinets + 4 flutes = 7 people

Now, let's add up the total number of people:

Orchestra section: 1
Brass section: 7
Strings section: 5
Woodwinds section: 7

Total number of people in the orchestra: 1 + 7 + 5 + 7 = 20
</reasoning>
<answer>
20
</answer>'''

# Use re.DOTALL to ensure . matches newline characters
pattern = r"^<reasoning>.*</reasoning>\n<answer>.*?</answer>$"

match = re.match(pattern, text, re.DOTALL)

if match:
    print("Match found!")
else:
    print("No match.")

Match found!


In [10]:
from trl import GRPOConfig, SFTConfig
from datetime import datetime

stime = datetime.now().strftime('%Y-%M-%d %H:%m:%S')

ds_name = 'gsm8k'
param_size = "1b"
#model_patch = f'Qwen2.5-{param_size}-Instruct'
model_patch = f'gemma-3-{param_size}-it'
# model_name = f"Qwen/Qwen2.5-{param_size}-Instruct"
#model_name = f'Qwen/{model_patch}'
model_name = f'google/{model_patch}'

output_dir=f"outputs/{model_patch}-GRPO-peft"
run_name=f"{model_patch}-GRPO-{ds_name}-{stime}"

training_args = GRPOConfig(
    output_dir=output_dir,
    run_name=run_name,
    learning_rate=5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = 'cosine',
    logging_steps=1,
    bf16=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_generations=4,
    max_prompt_length=256,
    max_completion_length=200,
    num_train_epochs=1,
    save_steps=100,
    max_grad_norm=0.1,
    log_on_each_node=False,
    use_vllm=False,
    vllm_gpu_memory_utilization=0.3,
    vllm_device='cuda:0',
    report_to='wandb',
)

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.bfloat16,
#     device_map="auto"
# )

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

## train

In [11]:
from trl import GRPOTrainer
from peft import LoraConfig
import torch
from accelerate import notebook_launcher

# def main():
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    task_type='CAUSAL_LM',
    target_modules=["q_proj", "v_proj"],
)
# peft_config = None


trainer = GRPOTrainer(
    model=model_name,
    processing_class=tokenizer,
    reward_funcs=[
        xml_count_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
        #similarity_reward_func,
    ],
    args=training_args,
    train_dataset=dataset,
    peft_config=peft_config,
    # peft_config
)

print('param=', param_size, 'peft', peft_config)
trainer.train()

trainer.save_model(output_dir)

# notebook_launcher(main, args=(), num_processes=1, mixed_precision='bf16')

[2025-04-01 14:34:13,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/samtang/miniconda3/envs/rl/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/home/samtang/miniconda3/envs/

INFO 04-01 14:34:13 __init__.py:183] Automatically detected platform cuda.


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


param= 1b peft LoraConfig(task_type='CAUSAL_LM', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='google/gemma-3-1b-it', revision=None, inference_mode=False, r=8, target_modules={'v_proj', 'q_proj'}, exclude_modules=None, lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdruidlangde[0m ([33mdruidlangde-tencent[0m). Use [1m`wandb login --relogin`[0m to force relogin


`generation_config` default values have been modified to match model-specific defaults: {'cache_implementation': 'hybrid', 'top_k': 64, 'top_p': 0.95, 'bos_token_id': 2, 'eos_token_id': [1, 106]}. If this is not desired, please set these values explicitly.
It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


-------------------- Question:
There are 15 tables in the school's cafeteria. Each table can seat 10 people. Usually, only 1/10 of the seats are left unseated. How many seats are usually taken?
 Answer: 135
 Extracted: 15


Step,Training Loss
1,-0.0
2,0.0
3,0.0
4,0.0
5,0.0001
6,0.0
7,0.0001
8,0.0001
9,0.0
10,0.0001


-------------------- Question:
Boston had .5 feet of snow on the first day of winter.  The next day they got an additional 8 inches.  Over the next 2 days, 2 inches of the snow melted.  On the fifth day, they received another 2 times the amount of snow they received on the first day.  How many feet of snow do they now have?
 Answer: 2
 Extracted: 7.5
-------------------- Question:
Sebastian plays drums for percussion in an orchestra. He sits next to the brass section where four people play trombone, two play trumpet, and one plays a French horn. In front of them are the strings section with three violinists, a cellist, and a contrabassist. Next to the strings are a trio of clarinets and a quartet of flute players in the woodwinds section. The maestro conducts them all. How many people are in the orchestra?
 Answer: 21
 Extracted: 12
-------------------- Question:
For a school fundraiser, Tory needs to sell 50 packs of cookies. So far, he has sold 12 packs to his grandmother, 7 packs to

In [None]:
# trainer.push_to_hub()

## eval

> 1.5b
> - ***** eval metrics *****
> - eval_loss               =        0.0
> - eval_runtime            = 0:29:24.17
> - eval_samples            =       1319
> - eval_samples_per_second =      0.748
> - eval_steps_per_second   =      0.187

> 0.5b

In [None]:
from evalscope.run import run_task
import os

os.environ['all_proxy'] = ''

task_cfg = {
    'model': 'outputs/Qwen-0.5B-GRPO-Continuous',
    'datasets': ['gsm8k', 'math_500'],
    'dataset_args': {"gsm8k": {"system_prompt": "\nResponde in the following format:\n<reasoning>\n...\n</reasoning>\n<answer>\n...\n</answer>\n"}, "math_500":{"system_prompt": "\nResponde in the following format:\n<reasoning>\n...\n</reasoning>\n<answer>\n...\n</answer>\n"}},
}

run_task(task_cfg=task_cfg)

In [9]:
eval_dataset = get_gsm8k_questions(split='test')
eval_dataset

Dataset({
    features: ['question', 'answer', 'prompt'],
    num_rows: 1319
})

## inference

In [1]:
from vllm import LLM
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

output_dir = 'outputs/gemma-3-1b-it-GRPO-peft'
# output_dir = 'Qwen/Qwen2.5-0.5B-Instruct'
model = AutoModelForCausalLM.from_pretrained(output_dir)
# model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-1.5B-Instruct')
# model = PeftModel.from_pretrained(model, output_dir)
# model = model.merge_and_unload()
model.to('cuda')
tokenizer = AutoTokenizer.from_pretrained(output_dir)

INFO 04-01 19:11:35 __init__.py:183] Automatically detected platform cuda.


In [5]:
from transformers import TextStreamer
import torch

SYSTEM_PROMPT = """
Responde in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

def generate_with_stream(input_text):
    print(f"\n输入: \n{input_text}")
    print("\n输出:")

    prompts = [
        {"role":"system", "content": SYSTEM_PROMPT},
        {"role":"user", "content":input_text},
    ]
    chats = tokenizer.apply_chat_template(prompts, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(chats, return_tensors="pt").to("cuda")
    streamer = TextStreamer(tokenizer)
    
    with torch.no_grad():
        outputs = model.generate(
            inputs['input_ids'],
            max_length=512,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            streamer=streamer
        )
    
    # 完整结果
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)


# 使用
input_text = "1+3*9/2+1.2+(1+2*3)=?"
# input_text = '''
# A very special island is inhabited only by knights and knaves. Knights always tell
# the truth, and knaves always lie. You meet 2 inhabitants: Zoey, and Oliver. Zoey remarked,
# "Oliver is not a knight". Oliver stated, "Oliver is a knight if and only if Zoey is a knave". So
# who is a knight and who is a knave?
# '''
generate_with_stream(input_text)


输入: 
1+3*9/2+1.2+(1+2*3)=?

输出:
<bos><bos><start_of_turn>user

Responde in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>


1+3*9/2+1.2+(1+2*3)=?<end_of_turn>
<start_of_turn>model
<reasoning>
This problem involves a series of operations. We need to follow the order of operations (PEMDAS/BODMAS): Parentheses/Brackets, Exponents/Orders, Multiplication and Division (from left to right), Addition and Subtraction (from left to right).
First, we solve the expression inside the parentheses: 1.2
Next, we perform the multiplication: 1.2 * 3 = 3.6
Then, we perform the division: 9 / 2 = 4.5
Now, we have 1 + 3.6 + 1.2 + 4.5 = 1 + 3.6 + 1.2 + 4.5
Next, we perform the addition: 1 + 3.6 + 1.2 + 4.5 = 4.6 + 1.2 + 4.5 = 5.8 + 4.5 = 10.3
Therefore, 1 + 3*9/2 + 1.2 + (1+2*3) = 10.3
</reasoning>
<answer>10.3</answer><end_of_turn>


## test util


s0 = torch.randn(2,3,5)
s1 = torch.randn(2,3,5)

In [13]:
torch.cat([s0,s1], dim=1).shape

torch.Size([2, 6, 5])

In [14]:
torch.stack([s0, s1],dim=1).shape

torch.Size([2, 2, 3, 5])

In [15]:
from torch import nn
# 定义 LSTM
lstm = nn.LSTM(input_size=5, hidden_size=10, num_layers=1, batch_first=True)

# 生成随机输入 (batch_size=2, seq_len=3, input_size=5)
x = torch.randn(2, 3, 5)

# 前向传播
output, (hn, cn) = lstm(x)

print(f"output shape: {output.shape}")  # (2, 3, 10)
print(f"hn shape: {hn.shape}")          # (1, 2, 10)
print(f"cn shape: {cn.shape}")          # (1, 2, 10)

output shape: torch.Size([2, 3, 10])
hn shape: torch.Size([1, 2, 10])
cn shape: torch.Size([1, 2, 10])


In [1]:
from evalscope.run import run_task

task_cfg = {
    'model': 'Qwen/Qwen2.5-0.5B-Instruct',
    'datasets': ['gsm8k', 'arc'],
    'limit': 5
}

run_task(task_cfg=task_cfg)

ModuleNotFoundError: No module named 'lighteval'