In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerFast, PreTrainedModel
from transformers import Trainer, TrainingArguments
from trl import SFTConfig, SFTTrainer
from trl import setup_chat_format
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset, Dataset
from concurrent.futures import ThreadPoolExecutor
from trl import DataCollatorForCompletionOnlyLM
import torch
from vllm import LLM, SamplingParams
from vllm.model_executor import set_random_seed as vllm_set_random_seed
from drgrpo_grader import r1_zero_reward_fn
import gc
from unittest.mock import patch
import wandb
import safetensors
import os
import json
import numpy as np
model_name = "Qwen/Qwen2.5-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    ).to(device)
tokenizer.padding_side = "left"
# 1) Add a new [PAD] token
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# 2) Resize the model’s embeddings to match the new tokenizer
model.resize_token_embeddings(len(tokenizer))

# by default, truncate from left side, and sacrifice prompt
# Make sure your tokenizer knows about <|im_end|> already:
#   (Qwen’s tokenizer has <|im_end|> in additional_special_tokens, dont need to create id, only find it through convert_tokens_to_ids)

# 1) Look up the ID of the built-in <|im_end|> token:
im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")

# 2) Tell the tokenizer to use that as its pad token:
#tokenizer.pad_token = "<|im_end|>"
tokenizer.eos_token_id = im_end_id

ds = load_dataset("openai/gsm8k", "main")

def preprocess_dataset(ds, usage):
    questions, answers =  ds[usage]["question"], ds[usage]["answer"]
    with open("prompts/r1_zero.prompt", "r", encoding="utf-8") as f:
        prompt_string = f.read()

    def process_question(q):
        return prompt_string.format(question=q)
    def process_ground_truth(ans):
        return ans.split('\n#### ')[1]
    def process_prompt_completion(q, ans):
        prompt = prompt_string.format(question=q)
        cot =' ' + ans.split('\n#### ')[0] + ' </think>'
        gt = f" <answer> {ans.split('\n#### ')[1]} </answer>"
        return prompt + cot + gt
    with ThreadPoolExecutor() as executor:
        question_prompts = list(executor.map(process_question, ds[usage]["question"]))
    with ThreadPoolExecutor() as executor:
        ground_truth = list(executor.map(process_ground_truth, ds[usage]["answer"]))
    with ThreadPoolExecutor() as executor:
        prompt_completion = list(executor.map(process_prompt_completion, ds[usage]["question"], ds[usage]["answer"]))
    return question_prompts, ground_truth, prompt_completion

training_data = preprocess_dataset(ds, 'train')[2]
test_prompt, test_gt =  preprocess_dataset(ds, 'train')[0], preprocess_dataset(ds, 'train')[1]
train_ds = Dataset.from_dict({"text": preprocess_dataset(ds, 'train')[2]})
val_ds = Dataset.from_dict({
    "prompt": test_prompt[:len(test_prompt)//2],
    "gt": test_gt[:len(test_gt)//2],

})
test_ds = Dataset.from_dict({
    "prompt": test_prompt[len(test_prompt)//2:],
    "gt": test_gt[len(test_gt)//2:],

})

# Build a collator whose response_template matches your prompt ending

collator = DataCollatorForCompletionOnlyLM(
    tokenizer = tokenizer,
    # Anything before *and including* this string gets label = -100
    response_template  = r"Assistant: <think>",   # note the space after >
    
)


#preprocess_dataset(ds)[2][0:2]

# some test on collator
# curr_batch = tokenizer(
#     preprocess_dataset(ds)[2][0:2],
#     padding=False,               # collator can handle this , set to false
#     truncation=False,           # collator will handle this with left truncate
#     return_special_tokens_mask=False   # only need for MLM task
#     #return_tensors=None        # lists, not tensors – collator wants lists
# )

# # #collator format [[dict] , [dict], [dict]]
# cnt = len(curr_batch['input_ids'])

# curr_batch_reconstruct =[]
# for i in range(cnt):
#     temp = dict()
#     for k,vals in curr_batch.items():
#         temp[k] = vals[i]
#     curr_batch_reconstruct.append(temp)




# collator_output_dict = collator.torch_call(curr_batch_reconstruct)
# collator_output_dict['labels'][1]
# print(collator_output_dict['input_ids'][0][128:])
# print(collator_output_dict['labels'][0][128:])
# print(len(collator_output_dict['input_ids'][0]))
# print(len(collator_output_dict['labels'][0]))
# # # print(tokenizer.decode(collator_output_dict['input_ids'][1]))
# # # print(tokenizer.decode([42 if v==-100 else v for v in collator_output_dict['labels'][1].tolist() ]))


# # #collator([12,33])
# # #print(curr_batch.keys())
# # #len(curr_batch['input_ids'][0]), len(curr_batch['input_ids'][1])    
# # #curr_batch['attention_mask'][1] # the second mask has 1 digit 0

Using the latest cached version of the dataset since openai/gsm8k couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'main' at /home/sagemaker-user/.cache/huggingface/datasets/openai___gsm8k/main/0.0.0/e53f048856ff4f594e959d75785d2c2d37b678ee (last modified on Mon Jun  2 05:00:15 2025).


In [6]:
#tokenizer.decode([151645])
#train_ds['text']
tokenizer.pad_token_id
#tokenizer.eos_token_id
#val_ds['prompt']
# print(training_data[0])
#print(val_ds['prompt'][0])
# print(ds['train']['question'][0])
# print(ds['test']['question'][0])
test_prompt

['A conversation between User and Assistant. The User asks a question, and the Assistant solves it. The Assistant first thinks about the reasoning process in the mind and then provides the User with the answer. The reasoning process is enclosed within <think> </think> and answer is enclosed within <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.\nUser: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\nAssistant: <think>',
 'A conversation between User and Assistant. The User asks a question, and the Assistant solves it. The Assistant first thinks about the reasoning process in the mind and then provides the User with the answer. The reasoning process is enclosed within <think> </think> and answer is enclosed within <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer

In [4]:

#model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")



sft_config = SFTConfig(
    max_seq_length=1024,
    output_dir="./checkpoint/sft/sft_lora_results2",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,  
    num_train_epochs=3,
   
    logging_steps=10,
    label_names=["labels"],
    warmup_ratio=0.03,
    report_to = "wandb",  
    bf16=True,   
    #pad_token_id=eos_id,
    #eos_token_id=eos_id,          # <— this is what TRL will use to stop
    # you can also set other generation defaults here if you like
)


# LoRA 配置
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,  
    lora_alpha=32,
    lora_dropout=0.1,
   
    modules_to_save=["embed_tokens", "lm_head"],
    #target_modules='all-linear'
    target_modules = ["q_proj","v_proj"]
)

# 将LoRA配置应用到模型
peft_model = get_peft_model(model, lora_config)




# # 训练参数配置
# training_args = TrainingArguments(
#     output_dir="./checkpoint/sft/sft_lora_results",
#     per_device_train_batch_size=2,
#     gradient_accumulation_steps=8,
#     learning_rate=1e-4,  
#     num_train_epochs=3,
#     fp16=True,  
#     logging_steps=1
# )

# 使用Trainer API进行训练
trainer = SFTTrainer(
    model=peft_model,
    #args=training_args,
    train_dataset=train_ds,
    data_collator=collator, 
    args=sft_config,
    #data_collator=torch.utils.data.DataCollatorWithPadding(tokenizer=tokenizer)
)

trainer.train()

Converting train dataset to ChatML:   0%|          | 0/7473 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/7473 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/7473 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/7473 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Currently logged in as: [33mhaoranyu66[0m ([33mudacity_jeff[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,0.5243
20,0.5833
30,0.5207
40,0.5215
50,0.5278
60,0.5113
70,0.4727
80,0.4668
90,0.4658
100,0.4399




TrainOutput(global_step=1404, training_loss=0.3511641578287141, metrics={'train_runtime': 2114.3989, 'train_samples_per_second': 10.603, 'train_steps_per_second': 0.664, 'total_flos': 6.644537586695578e+16, 'train_loss': 0.3511641578287141})

In [2]:

def init_vllm(model_id: str, tokenizer, device: str, seed: int, gpu_memory_utilization: float = 0.85):
    """Start the inference process, here we use vLLM to hold a model on
    a GPU separate from the policy.
    """
    vllm_set_random_seed(seed)
    # Monkeypatch from TRL:
    # https://github.com/huggingface/trl/blob/
    # 22759c820867c8659d00082ba8cf004e963873c1/trl/trainer/grpo_trainer.py
    # Patch vLLM to make sure we can
    # (1) place the vLLM model on the desired device (world_size_patch) and
    # (2) avoid a test that is not designed for our setting (profiling_patch).
    world_size_patch = patch("torch.distributed.get_world_size", return_value=1)
    profiling_patch = patch(
    "vllm.worker.worker.Worker._assert_memory_footprint_increased_during_profiling",
    return_value=None
    )
    with world_size_patch, profiling_patch:
        return LLM(
        model=model_id,
        tokenizer=tokenizer,
        tokenizer_mode='auto',
        device=device,
        dtype=torch.bfloat16,
        enable_prefix_caching=True,
        gpu_memory_utilization=gpu_memory_utilization,
        
        )

def load_policy_into_vllm_instance(policy: PreTrainedModel, tokenizer, llm: LLM):
    """ Copied from https://github.com/huggingface/trl/blob/
    22759c820867c8659d00082ba8cf004e963873c1/trl/trainer/grpo_trainer.py#L670.
    """
    state_dict = policy.state_dict()
    llm_model = llm.llm_engine.model_executor.driver_worker.model_runner.model
    llm_model.load_weights(state_dict.items())
    #llm.llm_engine.tokenizer = tokenizer

def sft_evaluation(policy: PreTrainedModel, tokenizer, val_ds, model_id: str, policy_device: str, llm_device: str, out_dir):
    start_time = time.time()
    #format_reward, answer_reward, answer = [], [], []
    # total_response_len = 0
    # total_response_len_correct = 0
    # total_response_len_incorrect = 0
    # total_sample = 0
    # total_correct_sample = 0
    # total_incorrect_sample = 0
    response_avg_entropy_lst = [] #(batch,)
    response_len_lst = [] #(batch,)
    correct_lst = []
    incorrect_lst = []
    #initialize llm for vllm
    llm = init_vllm(model_id, tokenizer, llm_device, 233)
    load_policy_into_vllm_instance(policy, tokenizer, llm)
    sampling_params = SamplingParams(
    temperature=1.0, top_p=1.0, max_tokens=1024, stop=["</answer>"]
    )
    sampling_params.include_stop_str_in_output = True
    all_prompt_texts = val_ds['prompt']
    all_answer_gt = val_ds['gt']
    # for batch in val_dataset:
    #     all_prompt_texts.extend(batch['prompt_texts'])
    #     all_answer_gt.extend(batch['answer_gt'] ) 
   
    all_model_outputs = llm.generate(all_prompt_texts, sampling_params)
    start2_time = time.time()
    print(f'generate all response time: {start2_time - start_time}------------')

    # still need policy model for eval mode
    #policy.eval()
    os.makedirs(out_dir.rsplit('/', 1)[0], exist_ok=True) 
    with open(out_dir, "a", encoding="utf-8") as f:
        # for idx, batch in enumerate(tqdm(dataloader, total=len(dataloader))):
        #     #start2_time = time.time()
        #     #prompt_texts = batch['prompt_texts']
        #     #answer_texts = batch['answer_texts'] 
        #     #answer_gt = batch['answer_gt'] 
            
        #     #batch_size = batch['input_ids'].size(0)
        #     input_ids = batch['input_ids'].to(policy_device)
        #     labels = batch['labels'].to(policy_device)
        #     response_masks = batch['response_mask'].to(policy_device)
        #     with torch.no_grad():
        #         response = get_response_log_probs(policy, input_ids, labels, True)
        #         log_probs, token_entropy = response['log_probs'], response['token_entropy']
        #         normalzied_token_entropy = token_entropy * response_masks
                
        #         curr_batch_response_len = torch.sum(response_masks,dim=1)#.tolist()
        #         curr_batch_response_entropy = torch.sum(normalzied_token_entropy, dim=1)#.tolist()
        #         curr_batch_response_avg_entropy = curr_batch_response_entropy / curr_batch_response_len
                

        #         curr_batch_response_len = curr_batch_response_len.tolist()
        #         curr_batch_response_avg_entropy = curr_batch_response_avg_entropy.tolist()
        #         response_len_lst.extend(curr_batch_response_len)
        #         response_avg_entropy_lst.extend(curr_batch_response_avg_entropy)
        #         # print('curr_batch_response_len', curr_batch_response_len)
        #         # print('curr_batch_response_avg_entropy', curr_batch_response_avg_entropy)
                



        #     # # Generate texts from the prompts. The output is a list of RequestOutput objects
        #     # # that contain the prompt, generated text, and other information.
        #     # stop1_time = time.time()
        #     # model_outputs = llm.generate(prompt_texts, sampling_params)
        #     # stop2_time = time.time()

        for i, (output, gt) in enumerate(zip(all_model_outputs, all_answer_gt)):
            prompt = output.prompt
            generated_text = output.outputs[0].text
            
            res = r1_zero_reward_fn(generated_text, gt)
            # format_reward+= res['format_reward']
            # answer_reward+= res['answer_reward']
            # reward+= res['reward']
            #print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}, format_reward: {str(res['format_reward'])}, answer_reward: {str(res['answer_reward'])}, reward: {str(res['reward'])}")
            #final_output.append([res['format_reward'], res['answer_reward'], res['reward']])
            dp = {
                "prompt": f"{prompt}",
                "ground_truth": gt, 
                "output": f"{generated_text}",
                "format_reward": res['format_reward'],
                "answer_reward": res['answer_reward'],
                "reward": res['reward'],
                #"avg_response_token_entropy": response_avg_entropy_lst[i]
            }
            correct_lst.append(int(res['reward']==1))
            incorrect_lst.append(int(res['reward']!=1))

            json.dump(dp, f, ensure_ascii=False)
            
    #total_response_len_correct = np.sum(np.array(correct_lst) * np.array(response_len_lst))
    #total_response_len_incorrect = np.sum(np.array(incorrect_lst) * np.array(response_len_lst))
    total_correct_sample = np.sum(np.array(correct_lst))
    total_incorrect_sample = np.sum(np.array(incorrect_lst))
    total_sample = len(all_model_outputs)
    total_response_len = np.sum(np.array(response_len_lst))

    print(f'total_correct_sample: {total_correct_sample}, total_incorrect_sample: {total_incorrect_sample}')
    #print(f'final stats\navg_response_len={total_response_len/total_sample:.2f}, avg_response_len_correct={total_response_len_correct/total_correct_sample:.2f}, avg_response_len_incorrect={total_response_len_incorrect/total_incorrect_sample:.2f}')
    #print("some check:", total_response_len_correct+total_response_len_incorrect==total_response_len)
    #print("moer check:", total_correct_sample+total_incorrect_sample==total_sample)
    import gc
    del llm

    torch.cuda.empty_cache()
    gc.collect()
    return total_correct_sample / total_sample


In [4]:
import time
model_name = "Qwen/Qwen2.5-1.5B"
tokenizer_dir =  "./checkpoint/sft/sft_lora_results2/checkpoint-1000"
model_dir =      "checkpoint/sft/sft_lora_results2/checkpoint-1000/adapter_model.safetensors"
eval_out_dir = 'eval/sft/sft_lora_results2/checkpoint-1000/sft_eval.jsonl'
tokenizer =  AutoTokenizer.from_pretrained(tokenizer_dir, local_files_only=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    ).to(device)
# 2) Load only the adapter weights into the base model
adapter_sd = safetensors.torch.load_file(
    model_dir,
    device="cpu"
)
model.load_state_dict(adapter_sd, strict=False)
sft_evaluation(model, tokenizer_dir, val_ds, model_name, device, device, eval_out_dir)

INFO 06-08 18:05:52 config.py:542] This model supports multiple tasks: {'embed', 'reward', 'classify', 'generate', 'score'}. Defaulting to 'generate'.
INFO 06-08 18:05:52 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='Qwen/Qwen2.5-1.5B', speculative_config=None, tokenizer='./checkpoint/sft/sft_lora_results2/checkpoint-1000', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=Qwen/Qwen2.5-1.

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 06-08 18:05:54 model_runner.py:1115] Loading model weights took 2.8797 GB
INFO 06-08 18:06:00 worker.py:267] Memory profiling takes 6.25 seconds
INFO 06-08 18:06:00 worker.py:267] the current vLLM instance can use total_gpu_memory (21.98GiB) x gpu_memory_utilization (0.85) = 18.68GiB
INFO 06-08 18:06:00 worker.py:267] model weights take 2.88GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 8.07GiB; the rest of the memory reserved for KV Cache is 7.73GiB.
INFO 06-08 18:06:00 executor_base.py:110] # CUDA blocks: 18101, # CPU blocks: 9362
INFO 06-08 18:06:00 executor_base.py:115] Maximum concurrency for 131072 tokens per request: 2.21x
INFO 06-08 18:06:01 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_ut

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:18<00:00,  1.94it/s]

INFO 06-08 18:06:19 model_runner.py:1562] Graph capturing finished in 18 secs, took 0.05 GiB
INFO 06-08 18:06:19 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 24.99 seconds



Processed prompts: 100%|██████████| 3736/3736 [02:21<00:00, 26.31it/s, est. speed input: 4000.72 toks/s, output: 4125.48 toks/s]


generate all response time: 171.94753980636597------------
total_correct_sample: 259, total_incorrect_sample: 3477


0.0693254817987152

In [10]:
# base model
model_name = "Qwen/Qwen2.5-1.5B"
eval_out_dir = 'eval/sft/sft_lora_results2/base_model/sft_eval.jsonl'
llm = LLM(model_name, dtype="bfloat16")
# Create a sampling params object, stopping generation on newline.
sampling_params = SamplingParams(
temperature=1.0, top_p=1.0, max_tokens=1024, stop=["</answer>"]
)
sampling_params.include_stop_str_in_output = True

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
model_outputs = llm.generate(question_prompts, sampling_params)
all_prompt_texts = val_ds['prompt']
all_answer_gt = val_ds['gt']


all_model_outputs = llm.generate(all_prompt_texts, sampling_params)
start2_time = time.time()
print(f'generate all response time: {start2_time - start_time}------------')

# still need policy model for eval mode
#policy.eval()
os.makedirs(eval_out_dir.rsplit('/', 1)[0], exist_ok=True) 
with open(eval_out_dir, "a", encoding="utf-8") as f:
    for i, (output, gt) in enumerate(zip(all_model_outputs, all_answer_gt)):
        prompt = output.prompt
        generated_text = output.outputs[0].text
        
        res = r1_zero_reward_fn(generated_text, gt)
        # format_reward+= res['format_reward']
        # answer_reward+= res['answer_reward']
        # reward+= res['reward']
        #print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}, format_reward: {str(res['format_reward'])}, answer_reward: {str(res['answer_reward'])}, reward: {str(res['reward'])}")
        #final_output.append([res['format_reward'], res['answer_reward'], res['reward']])
        dp = {
            "prompt": f"{prompt}",
            "ground_truth": gt, 
            "output": f"{generated_text}",
            "format_reward": res['format_reward'],
            "answer_reward": res['answer_reward'],
            "reward": res['reward'],
            #"avg_response_token_entropy": response_avg_entropy_lst[i]
        }
        correct_lst.append(int(res['reward']==1))
        incorrect_lst.append(int(res['reward']!=1))

        json.dump(dp, f, ensure_ascii=False)
            
#total_response_len_correct = np.sum(np.array(correct_lst) * np.array(response_len_lst))
#total_response_len_incorrect = np.sum(np.array(incorrect_lst) * np.array(response_len_lst))
total_correct_sample = np.sum(np.array(correct_lst))
total_incorrect_sample = np.sum(np.array(incorrect_lst))
total_sample = len(all_model_outputs)
total_response_len = np.sum(np.array(response_len_lst))

print(f'total_correct_sample: {total_correct_sample}, total_incorrect_sample: {total_incorrect_sample}')
    

ReadTimeout: (ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 3fdf46f4-db62-4f95-9f04-73f0bc5d6c46)')

In [9]:
output = " We'll use cancellation technique to solve this problem. Natalia sold half as many clips in may as she had in April. That means she sold 12 clips in May since 48 / 2 = 24. Note that we must never cancel the first 24/, as it is 0. Our answer is 48+24 = 72 clips.</think>\n\n<answer> 72 </answer>"
r1_zero_reward_fn(output, '72')



True

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "Qwen/Qwen2.5-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
#tokenizer.decode(tokenizer.pad_token_id)

messages = [
    {"role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate",},
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
 ]
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
tokenized_chat


tensor([[151644,   8948,    198,   2610,    525,    264,  11657,   6236,   6331,
            879,   2677,  30580,    304,    279,   1707,    315,    264,  53966,
         151645,    198, 151644,    872,    198,   4340,   1657,  58332,    646,
            264,   3738,   8180,    304,    825,  11699,     30, 151645,    198,
         151644,  77091,    198]])

In [8]:
tokenizer.decode(tokenized_chat[0])

'<|im_start|>system\nYou are a friendly chatbot who always responds in the style of a pirate<|im_end|>\n<|im_start|>user\nHow many helicopters can a human eat in one sitting?<|im_end|>\n<|im_start|>assistant\n'

In [3]:
#setup_chat_format is used to create chat template with [{role:xx, content:xx}] data, to convert it to purely text
#we dont need to setup_chat_format for qwen. it has been setup
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from trl import SFTConfig, SFTTrainer
# from trl import setup_chat_format
# model_name = "Qwen/Qwen2.5-1.5B"
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
# model, tokenizer = setup_chat_format(model, tokenizer)
# tokenizer.decode(tokenizer.eos_token_id)
print(tokenizer.additional_special_tokens)

['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']


In [5]:
tokenizer.model_max_length

131072