In [None]:
!pip install accelerate datasets transformers trl torch peft wandb jinja2==3.1.0 vllm

In [1]:
import torch
import transformers
import trl
import jinja2
import vllm
import peft

print(f"PyTorch version: {torch.__version__}")
print(f"Transformers backend: {transformers.file_utils.is_torch_available()}")
print(f"{jinja2.__version__ }")
print(f"{vllm.__version__ }")
print(f"{peft.__version__}")

  from .autonotebook import tqdm as notebook_tqdm


INFO 02-23 13:35:57 __init__.py:183] Automatically detected platform cuda.


2025-02-23 13:35:57,581	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


PyTorch version: 2.5.1+cu124
Transformers backend: True
3.1.4
0.7.0
0.14.0


In [1]:
"""
Reference:

https://gist.github.com/willccbb/4676755236bb08cab5f4e54a0475d6fb
"""
import re
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
# load and prepare ds
SYSTEM_PROMPT = """
Responde in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

In [3]:
def extract_xml_answer(text:str):
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()

def extract_hash_answer(text:str):
    if '####' not in text:
        return None
    return text.split('####')[1].strip()

def get_gsm8k_questions(split='train'):
    data = load_dataset('openai/gsm8k', 'main')[split]
    data = data.map(lambda x: {
        'prompt':[
            {'role':'system', 'content': SYSTEM_PROMPT},
            {'role':'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    }) # type: ignore
    return data
print(extract_xml_answer("<answer>123</answer>"))
print(extract_hash_answer('12 #### asas'))

123
asas


In [4]:
dataset = get_gsm8k_questions()
dataset

Using the latest cached version of the dataset since openai/gsm8k couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'main' at /home/samtang/.hf/datasets/openai___gsm8k/main/0.0.0/e53f048856ff4f594e959d75785d2c2d37b678ee (last modified on Sun Feb 23 11:21:12 2025).


Dataset({
    features: ['question', 'answer', 'prompt'],
    num_rows: 7473
})

In [5]:
# step = 0
# reward functions
def correctness_reward_func(prompts, completions, answer, **kwargs):
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content'] # [{role:system},{role:user},{role:assistance}]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    if '</answer>' in responses[0] and '<answer>' in responses[0] and '<reasoning>' in responses[0] and '</reasoning>' in responses[0]:
        # print('-'*20, f"Question:\n{q}\n", '-'*20, f"Answer:\n{answer[0]}\n", '-'*20, f"Response:\n{responses[0]}\n", '-'*20, f"Extracted:\n{extracted_responses[0]}",'\n\n')
        print('-'*20, f"Question:\n{q}\n", f"Answer: {answer[0]}\n", f"Extracted: {extracted_responses[0]}")
    return [10.0 if r==a else 0.0 for r,a in zip(extracted_responses, answer)]

def int_reward_func(completions, **kwargs):
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]

def strict_format_reward_func(completions, **kwargs):
    """
    Reward function that checks if the completion has as specific format
    """
    pattern = r"^<reasoning>.*</reasoning>\n<answer>.*?</answer>$"
    responses = [completion[0]['content'] for completion in completions]
    matches = [re.match(pattern, r, re.DOTALL) for r in responses]
    return [1.0 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs):
    """
    Reward function that checks if the completion has a specific format
    """
    pattern = r"<reasoning>.*</reasoning>\n<answer>.*?</answer>"
    responses = [completion[0]['content'] for completion in completions]
    matches = [re.match(pattern, r, re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text):
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count+=0.125
    if text.count("\n</reasoning>\n") == 1:
        count+=0.125
    if text.count("\n<answer>\n") == 1:
        count+=0.125
        count -= len(text.split("\n</answer>\n")[-1]) * 0.001
    if text.count("\n</answer>") == 1:
        count+=0.125
        count-=(len(text.split("\n</answer>")[-1])-1)*0.001
    return count

def xml_count_reward_func(completions, **kwargs):
    contents = [completion[0]['content'] for completion in completions]
    return [count_xml(c) for c in contents]



completions = [
    [{"role": "assistant", "content": "<reasoning>\nThe sum of 1 and 2 is 3, which we multiply by 4 to get 12.\n</reasoning>\n<answer>\n(1 + 2) * 4 = 12\n</answer>"}],
    # [{"role": "assistant", "content": "The sum of 3 and 1 is 4, which we multiply by 2 to get 8. So (3 + 1) * 2 = 8."}],
]

xml_count_reward_func(completions)
# 0.382


[0.382]

In [6]:
# <reasoning>
# First, let's count the number of people in each section:

# - Orchestra section: 1 (Sebastian, the drummer)
# - Brass section: 7 people (4 trombones, 2 trumpets, 1 French horn)
# - Strings section: 5 people (3 violins, 1 cellist, 1 contrabassist)
# - Woodwinds section: 3 clarinets + 4 flutes = 7 people

# Now, let's add up the total number of people:

# Orchestra section: 1
# Brass section: 7
# Strings section: 5
# Woodwinds section: 7

# Total number of people in the orchestra: 1 + 7 + 5 + 7 = 20
# </reasoning>

# <answer>
# 20
# </answer>

text = '''<reasoning>
First, let's count the number of people in each section:

- Orchestra section: 1 (Sebastian, the drummer)
- Brass section: 7 people (4 trombones, 2 trumpets, 1 French horn)
- Strings section: 5 people (3 violins, 1 cellist, 1 contrabassist)
- Woodwinds section: 3 clarinets + 4 flutes = 7 people

Now, let's add up the total number of people:

Orchestra section: 1
Brass section: 7
Strings section: 5
Woodwinds section: 7

Total number of people in the orchestra: 1 + 7 + 5 + 7 = 20
</reasoning>
<answer>
20
</answer>'''


# def strict_format_reward_func(completions, **kwargs) -> list[float]:
#     """Reward function that checks if the completion has a specific format."""
#     # pattern = r"^<reasoning>.*?</reasoning><answer>\n.*?\n</answer>\n$"
#     pattern = r"^<reasoning>.*</reasoning>\n<answer>.*?</answer>$"
#     responses = [completion[0]["content"] for completion in completions]
#     matches = [re.match(pattern, r, re.DOTALL) for r in responses]
#     return [0.5 if match else 0.0 for match in matches]
    
completions = [
    [{"role": "assistant", "content": text}],
    # [{"role": "assistant", "content": "The sum of 3 and 1 is 4, which we multiply by 2 to get 8. So (3 + 1) * 2 = 8."}],
]

strict_format_reward_func(completions)

[1.0]

In [7]:
import re

text = '''<reasoning>
First, let's count the number of people in each section:

- Orchestra section: 1 (Sebastian, the drummer)
- Brass section: 7 people (4 trombones, 2 trumpets, 1 French horn)
- Strings section: 5 people (3 violins, 1 cellist, 1 contrabassist)
- Woodwinds section: 3 clarinets + 4 flutes = 7 people

Now, let's add up the total number of people:

Orchestra section: 1
Brass section: 7
Strings section: 5
Woodwinds section: 7

Total number of people in the orchestra: 1 + 7 + 5 + 7 = 20
</reasoning>
<answer>
20
</answer>'''

# Use re.DOTALL to ensure . matches newline characters
pattern = r"^<reasoning>.*</reasoning>\n<answer>.*?</answer>$"

match = re.match(pattern, text, re.DOTALL)

if match:
    print("Match found!")
else:
    print("No match.")

Match found!


In [8]:
from trl import GRPOConfig
param_size = "3B"
batch_size = 1
model_name = f"Qwen/Qwen2.5-{param_size}-Instruct"

output_dir=f"outputs/Qwen-{param_size}-GRPO"
run_name=f"QWEN-{param_size}-GRPO-gsm8k-1"

training_args = GRPOConfig(
    output_dir=output_dir,
    run_name=run_name,
    learning_rate=5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = 'cosine',
    logging_steps=1,
    bf16=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_generations=4,
    max_prompt_length=256,
    max_completion_length=200,
    num_train_epochs=1,
    save_steps=100,
    max_grad_norm=0.1,
    log_on_each_node=False,
    use_vllm=False,
    vllm_gpu_memory_utilization=0.3,
    vllm_device='cuda:0',
    report_to='wandb',
)

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.bfloat16,
#     device_map="auto"
# )

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

## train

In [None]:
from trl import GRPOTrainer
from peft import LoraConfig
import torch
from accelerate import notebook_launcher

def main():
    peft_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        task_type='CAUSAL_LM'
    )
    
    
    trainer = GRPOTrainer(
        model=model_name,
        processing_class=tokenizer,
        reward_funcs=[
            xml_count_reward_func,
            soft_format_reward_func,
            strict_format_reward_func,
            int_reward_func,
            correctness_reward_func
        ],
        args=training_args,
        train_dataset=dataset,
        peft_config=peft_config,
        # peft_config
    )
    
    print('param=', param_size)
    trainer.train()
    
    trainer.save_model(output_dir)

notebook_launcher(main, args=(), num_processes=1, mixed_precision='bf16')

[2025-02-27 13:56:20,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/samtang/miniconda3/envs/rl/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


INFO 02-27 13:56:21 __init__.py:183] Automatically detected platform cuda.
Launching training on one GPU.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## eval

> 1.5b
> - ***** eval metrics *****
> - eval_loss               =        0.0
> - eval_runtime            = 0:29:24.17
> - eval_samples            =       1319
> - eval_samples_per_second =      0.748
> - eval_steps_per_second   =      0.187

> 0.5b

In [9]:
eval_dataset = get_gsm8k_questions(split='test')
eval_dataset

Dataset({
    features: ['question', 'answer', 'prompt'],
    num_rows: 1319
})

In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOTrainer

output_dir = 'outputs/Qwen-1.5B-GRPO'
model = AutoModelForCausalLM.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

evaluator = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[
        correctness_reward_func
    ],
    args=training_args,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    # peft_config=peft_config,
    # peft_config
)

metrics = evaluator.evaluate()
metrics['eval_samples'] = len(eval_dataset)
evaluator.log_metrics('eval', metrics)

[2025-02-26 17:55:44,675] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/samtang/miniconda3/envs/rl/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


INFO 02-26 17:55:45 __init__.py:183] Automatically detected platform cuda.


  return forward_call(*args, **kwargs)
  return func(*args, **kwargs)


-------------------- Question:
Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
 Answer: 18
 Extracted: 18


-------------------- Question:
Every day, Wendi feeds each of her chickens three cups of mixed chicken feed, containing seeds, mealworms and vegetables to help keep them healthy.  She gives the chickens their feed in three separate meals. In the morning, she gives her flock of chickens 15 cups of feed.  In the afternoon, she gives her chickens another 25 cups of feed.  How many cups of feed does she need to give her chickens in the final meal of the day if the size of Wendi's flock is 20 chickens?
 Answer: 20
 Extracted: 140
-------------------- Question:
Carlos is planting a lemon tree. The tree will cost $90 to plant. Each year it will grow 7 lemons, which he can sell for $1.5 each. It costs $3 a year to water and feed the tree. How many years will it take before he starts earning money on the lemon tree?
 Answer: 13
 Extracted: 10
-------------------- Question:
Two trains leave San Rafael at the same time. They begin traveling westward, both traveling for 80 miles. The next day, the

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdruidlangde[0m ([33mdruidlangde-tencent[0m). Use [1m`wandb login --relogin`[0m to force relogin


***** eval metrics *****
  eval_loss               =        0.0
  eval_runtime            = 0:29:20.46
  eval_samples            =       1319
  eval_samples_per_second =      0.749
  eval_steps_per_second   =      0.187


## inference

In [3]:
from vllm import LLM
from transformers import AutoModelForCausalLM, AutoTokenizer

output_dir = 'outputs/Qwen-1.5B-GRPO'
# output_dir = 'Qwen/Qwen2.5-0.5B-Instruct'
model = AutoModelForCausalLM.from_pretrained(output_dir).to('cuda')
tokenizer = AutoTokenizer.from_pretrained(output_dir)

In [4]:
from transformers import TextStreamer
import torch

SYSTEM_PROMPT = """
Responde in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

def generate_with_stream(input_text):
    print(f"\n输入: \n{input_text}")
    print("\n输出:")

    prompts = [
        {"role":"system", "content": SYSTEM_PROMPT},
        {"role":"user", "content":input_text},
    ]
    chats = tokenizer.apply_chat_template(prompts, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(chats, return_tensors="pt").to("cuda")
    streamer = TextStreamer(tokenizer)
    
    with torch.no_grad():
        outputs = model.generate(
            inputs['input_ids'],
            max_length=512,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            streamer=streamer
        )
    
    # 完整结果
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)


# 使用
input_text = "1+3*9/2+1.2+(1+2*3)=?"
generate_with_stream(input_text)


输入: 
1+3*9/2+1.2+(1+2*3)=?

输出:
<|im_start|>system

Responde in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
<|im_end|>
<|im_start|>user
1+3*9/2+1.2+(1+2*3)=?<|im_end|>
<|im_start|>assistant
<reasoning>
The expression is 1 + 3 * 9 / 2 + 1.2 + (1 + 2 * 3).
First, perform multiplication and division from left to right: 
1 + 3 * 9 / 2 + 1.2 + (1 + 6) = 
1 + 27 / 2 + 1.2 + 7.
Next, divide 27 by 2: 
1 + 13.5 + 1.2 + 7.
Finally, add all numbers together: 
1 + 13.5 + 1.2 + 7 = 22.7.
Therefore, the answer is 22.7.
</reasoning>
<answer>
22.7
</answer><|im_end|>


## test util


s0 = torch.randn(2,3,5)
s1 = torch.randn(2,3,5)

In [13]:
torch.cat([s0,s1], dim=1).shape

torch.Size([2, 6, 5])

In [14]:
torch.stack([s0, s1],dim=1).shape

torch.Size([2, 2, 3, 5])

In [15]:
from torch import nn
# 定义 LSTM
lstm = nn.LSTM(input_size=5, hidden_size=10, num_layers=1, batch_first=True)

# 生成随机输入 (batch_size=2, seq_len=3, input_size=5)
x = torch.randn(2, 3, 5)

# 前向传播
output, (hn, cn) = lstm(x)

print(f"output shape: {output.shape}")  # (2, 3, 10)
print(f"hn shape: {hn.shape}")          # (1, 2, 10)
print(f"cn shape: {cn.shape}")          # (1, 2, 10)

output shape: torch.Size([2, 3, 10])
hn shape: torch.Size([1, 2, 10])
cn shape: torch.Size([1, 2, 10])


In [1]:
from lighteval.metrics.dynamic_metrics import (
    ExprExtractionConfig,
    IndicesExtractionConfig,
    LatexExtractionConfig,
    multilingual_extractive_match_metric,
)

predictions = ["Paris", "Berlin", "Washington"]
ground_truths = ["Paris", "Berlin", "London"]

metrics = multilingual_extractive_match_metric(predictions, ground_truths)
print("Exact Match:", metrics['exact_match'])
print("F1 Score:", metrics['f1_score'])

ModuleNotFoundError: No module named 'lighteval'