In [1]:
!pip install datasets transformers trl torch tf-keras peft wandb

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install jinja2==3.1.0 vllm

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import torch
import transformers
import trl
import jinja2
import vllm

print(f"PyTorch version: {torch.__version__}")
print(f"Transformers backend: {transformers.file_utils.is_torch_available()}")
print(f"{jinja2.__version__ }")
print(f"{vllm.__version__ }")

  from .autonotebook import tqdm as notebook_tqdm


INFO 02-22 20:38:02 __init__.py:183] Automatically detected platform cuda.


2025-02-22 20:38:02,120	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


PyTorch version: 2.5.1+cu124
Transformers backend: True
3.1.4
0.7.0


In [1]:
"""
Reference:

https://gist.github.com/willccbb/4676755236bb08cab5f4e54a0475d6fb
"""
import re
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load and prepare ds
SYSTEM_PROMPT = """
Responde in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

In [3]:
def extract_xml_answer(text:str):
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()

def extract_hash_answer(text:str):
    if '####' not in text:
        return None
    return text.split('####')[1].strip()

def get_gsm8k_questions(split='train'):
    data = load_dataset('openai/gsm8k', 'main')[split]
    data = data.map(lambda x: {
        'prompt':[
            {'role':'system', 'content': SYSTEM_PROMPT},
            {'role':'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    }) # type: ignore
    return data
print(extract_xml_answer("<answer>123</answer>"))
print(extract_hash_answer('12 #### asas'))

123
asas


In [4]:
dataset = get_gsm8k_questions()
dataset

Dataset({
    features: ['question', 'answer', 'prompt'],
    num_rows: 7473
})

In [5]:
# step = 0
# reward functions
def correctness_reward_func(prompts, completions, answer, **kwargs):
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content'] # [{role:system},{role:user},{role:assistance}]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    if '</answer>' in responses[0] and '<answer>' in responses[0] and '<reasoning>' in responses[0] and '</reasoning>' in responses[0]:
        print('-'*20, f"Question:\n{q}\n", '-'*20, f"Answer:\n{answer[0]}\n", '-'*20, f"Response:\n{responses[0]}\n", '-'*20, f"Extracted:\n{extracted_responses[0]}",'\n\n')
    return [2.0 if r==a else 0.0 for r,a in zip(extracted_responses, answer)]

def int_reward_func(completions, **kwargs):
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]

def strict_format_reward_func(completions, **kwargs):
    """
    Reward function that checks if the completion has as specific format
    """
    pattern = r"^<reasoning>.*</reasoning>\n<answer>.*?</answer>$"
    responses = [completion[0]['content'] for completion in completions]
    matches = [re.match(pattern, r, re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs):
    """
    Reward function that checks if the completion has a specific format
    """
    pattern = r"<reasoning>.*</reasoning>\n<answer>.*?</answer>"
    responses = [completion[0]['content'] for completion in completions]
    matches = [re.match(pattern, r, re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text):
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count+=0.125
    if text.count("\n</reasoning>\n") == 1:
        count+=0.125
    if text.count("\n<answer>\n") == 1:
        count+=0.125
        count -= len(text.split("\n</answer>\n")[-1]) * 0.001
    if text.count("\n</answer>") == 1:
        count+=0.125
        count-=(len(text.split("\n</answer>")[-1])-1)*0.001
    return count

def xml_count_reward_func(completions, **kwargs):
    contents = [completion[0]['content'] for completion in completions]
    return [count_xml(c) for c in contents]



completions = [
    [{"role": "assistant", "content": "<reasoning>\nThe sum of 1 and 2 is 3, which we multiply by 4 to get 12.\n</reasoning>\n<answer>\n(1 + 2) * 4 = 12\n</answer>"}],
    # [{"role": "assistant", "content": "The sum of 3 and 1 is 4, which we multiply by 2 to get 8. So (3 + 1) * 2 = 8."}],
]

xml_count_reward_func(completions)
# 0.382


[0.382]

In [6]:
# <reasoning>
# First, let's count the number of people in each section:

# - Orchestra section: 1 (Sebastian, the drummer)
# - Brass section: 7 people (4 trombones, 2 trumpets, 1 French horn)
# - Strings section: 5 people (3 violins, 1 cellist, 1 contrabassist)
# - Woodwinds section: 3 clarinets + 4 flutes = 7 people

# Now, let's add up the total number of people:

# Orchestra section: 1
# Brass section: 7
# Strings section: 5
# Woodwinds section: 7

# Total number of people in the orchestra: 1 + 7 + 5 + 7 = 20
# </reasoning>

# <answer>
# 20
# </answer>

text = '''<reasoning>
First, let's count the number of people in each section:

- Orchestra section: 1 (Sebastian, the drummer)
- Brass section: 7 people (4 trombones, 2 trumpets, 1 French horn)
- Strings section: 5 people (3 violins, 1 cellist, 1 contrabassist)
- Woodwinds section: 3 clarinets + 4 flutes = 7 people

Now, let's add up the total number of people:

Orchestra section: 1
Brass section: 7
Strings section: 5
Woodwinds section: 7

Total number of people in the orchestra: 1 + 7 + 5 + 7 = 20
</reasoning>
<answer>
20
</answer>'''


# def strict_format_reward_func(completions, **kwargs) -> list[float]:
#     """Reward function that checks if the completion has a specific format."""
#     # pattern = r"^<reasoning>.*?</reasoning><answer>\n.*?\n</answer>\n$"
#     pattern = r"^<reasoning>.*</reasoning>\n<answer>.*?</answer>$"
#     responses = [completion[0]["content"] for completion in completions]
#     matches = [re.match(pattern, r, re.DOTALL) for r in responses]
#     return [0.5 if match else 0.0 for match in matches]
    
completions = [
    [{"role": "assistant", "content": text}],
    # [{"role": "assistant", "content": "The sum of 3 and 1 is 4, which we multiply by 2 to get 8. So (3 + 1) * 2 = 8."}],
]

strict_format_reward_func(completions)

[0.5]

In [7]:
import re

text = '''<reasoning>
First, let's count the number of people in each section:

- Orchestra section: 1 (Sebastian, the drummer)
- Brass section: 7 people (4 trombones, 2 trumpets, 1 French horn)
- Strings section: 5 people (3 violins, 1 cellist, 1 contrabassist)
- Woodwinds section: 3 clarinets + 4 flutes = 7 people

Now, let's add up the total number of people:

Orchestra section: 1
Brass section: 7
Strings section: 5
Woodwinds section: 7

Total number of people in the orchestra: 1 + 7 + 5 + 7 = 20
</reasoning>
<answer>
20
</answer>'''

# Use re.DOTALL to ensure . matches newline characters
pattern = r"^<reasoning>.*</reasoning>\n<answer>.*?</answer>$"

match = re.match(pattern, text, re.DOTALL)

if match:
    print("Match found!")
else:
    print("No match.")

Match found!


In [8]:
from trl import GRPOConfig
param_size = "1.5B"
model_name = f"Qwen/Qwen2.5-{param_size}-Instruct"

output_dir=f"outputs/Qwen-{param_size}-GRPO"
run_name=f"QWEN-{param_size}-GRPO-gsm8k-1"

training_args = GRPOConfig(
    output_dir=output_dir,
    run_name=run_name,
    learning_rate=5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = 'cosine',
    logging_steps=1,
    bf16=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_generations=12,
    max_prompt_length=256,
    max_completion_length=200,
    num_train_epochs=1,
    save_steps=100,
    max_grad_norm=0.1,
    log_on_each_node=False,
    use_vllm=False,
    vllm_gpu_memory_utilization=0.3,
    vllm_device='cuda:0',
    report_to='wandb',
)

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.bfloat16,
#     device_map="auto"
# )

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

## train

In [9]:
from trl import GRPOTrainer
from peft import LoraConfig
import torch

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    task_type='CAUSAL_LM'
)


trainer = GRPOTrainer(
    model=model_name,
    processing_class=tokenizer,
    reward_funcs=[
        xml_count_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func
    ],
    args=training_args,
    train_dataset=dataset,
    peft_config=peft_config,
    # peft_config
)

print('param=', param_size)
trainer.train()

trainer.save_model(output_dir)

[2025-02-22 21:10:46,403] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/samtang/miniconda3/envs/rl/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/samtang/miniconda3/envs/rl/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


INFO 02-22 21:10:46 __init__.py:183] Automatically detected platform cuda.


2025-02-22 21:10:47,022	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


param= 1.5B


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdruidlangde[0m ([33mdruidlangde-tencent[0m). Use [1m`wandb login --relogin`[0m to force relogin


  return self.model.forward(*args, **kwargs)


OutOfMemoryError: CUDA out of memory. Tried to allocate 932.00 MiB. GPU 0 has a total capacity of 23.61 GiB of which 621.50 MiB is free. Including non-PyTorch memory, this process has 21.73 GiB memory in use. Of the allocated memory 20.58 GiB is allocated by PyTorch, and 719.20 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## inference

In [8]:
from vllm import LLM
from transformers import AutoModelForCausalLM, AutoTokenizer

output_dir = 'outputs/Qwen-0.5B-GRPO'
model = AutoModelForCausalLM.from_pretrained(output_dir)
model.to('cuda')

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [9]:
tokenizer = AutoTokenizer.from_pretrained(output_dir)

In [18]:
from transformers import TextStreamer
import torch

def generate_with_stream(input_text):
    print(f"\n输入: \n{input_text}")
    print("\n输出:")
    
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
    streamer = TextStreamer(tokenizer)
    
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            max_length=512,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            streamer=streamer
        )
    
    # 完整结果
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# 使用
input_text = "Dave bought 8 books about animals, 6 books about outer space, and 3 books about trains to keep him busy over the holidays. Each book cost $6. How much did Dave spend on the books?"
generate_with_stream(input_text)


输入: 
Dave bought 8 books about animals, 6 books about outer space, and 3 books about trains to keep him busy over the holidays. Each book cost $6. How much did Dave spend on the books?

输出:
books? To determine how much Dave spent on the books, we first need to calculate the total quantity of books he acquired, then multiply by each individual price.pend on the 

10 books are devoted to Animals: 
 4 books regarding animal, 6 Books is (9)
 Total books = sum total book

20 books about outer Space
4 6Books is 12
Total books = 20 books

3 books About Train= 3  5 books  3(20) 

Calculate total books = 3( 3 3)= 7 20

Therefore, Total books amount to 7 4 5 5 6 = 60. Verify  60 Book  6 Cost 10 books per 6. Verification 60 books 20 cost  6 3 12

In the 20 book total books, the net cost of 20 units per $6 is the whole set 20 x 6 cost  6

The total spending in the books =  10 30  3.30 202.40

 20.40  60 total book =   60 20  12  40 点

Thus, Dave  spends  元于 books.

Final answers. 120, 00  高 延 怀 一

In [11]:
s0 = torch.randn(2,3,5)
s1 = torch.randn(2,3,5)

In [13]:
torch.cat([s0,s1], dim=1).shape

torch.Size([2, 6, 5])

In [14]:
torch.stack([s0, s1],dim=1).shape

torch.Size([2, 2, 3, 5])

In [15]:
from torch import nn
# 定义 LSTM
lstm = nn.LSTM(input_size=5, hidden_size=10, num_layers=1, batch_first=True)

# 生成随机输入 (batch_size=2, seq_len=3, input_size=5)
x = torch.randn(2, 3, 5)

# 前向传播
output, (hn, cn) = lstm(x)

print(f"output shape: {output.shape}")  # (2, 3, 10)
print(f"hn shape: {hn.shape}")          # (1, 2, 10)
print(f"cn shape: {cn.shape}")          # (1, 2, 10)

output shape: torch.Size([2, 3, 10])
hn shape: torch.Size([1, 2, 10])
cn shape: torch.Size([1, 2, 10])
