In [1]:
from transformers import AutoTokenizer
from v4.data.template import get_template_and_fix_tokenizer
from v4.hparams import DataArguments
from v4.data.converter import AlpacaDatasetConverter
from types import SimpleNamespace

# Step 1: load the tokenizer (keep this identifier in sync with the model you use).
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-1.5B-Instruct")

# Step 2: choose the chat template by name, e.g. "chatml", "llama2", "qwen", ...
# NOTE(review): the template is "deepseek" while the tokenizer above is a Qwen2.5
# checkpoint -- confirm this pairing is intentional.
data_args = DataArguments(template="deepseek")

# Step 3: build a simplified stand-in for DatasetAttr for the converter; only the
# attribute names need to line up.
# Maps the converter's logical fields to the column names of the raw sample.
alpaca_columns = {"prompt": "instruction", "query": "input", "response": "output"}
# Attributes that are present but left as None in this demo.
none_fields = dict.fromkeys(
    ("chosen", "rejected", "system", "tools", "images", "videos", "audios")
)
dataset_attr = SimpleNamespace(
    **alpaca_columns,
    history=None,
    kto_tag=None,
    ranking=False,
    **none_fields,
    load_from="file",
    formatting="alpaca",
)

# Per the original author's note, data_args is passed here only as a placeholder
# for media_dir.
converter = AlpacaDatasetConverter(data_args=data_args, dataset_attr=dataset_attr)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
import pprint

# The raw Alpaca-style sample to run through the pipeline.
sample = dict(
    instruction="Transform the following sentence using a synonym: The car sped quickly.",
    input="",
    output="The car accelerated rapidly.",
)

# Run the converter on the sample.
converted = converter(sample)

print("---- 转换后格式（converted） ----")
print(pprint.pformat(converted))

# Fetch the template and let it fix up the tokenizer's special tokens.
template = get_template_and_fix_tokenizer(tokenizer, data_args)

# The converted sample stores prompt and response messages under separate keys,
# each as a list of {"role", "content"} dicts (see the printed output).
prompt_msgs, response_msgs = converted["_prompt"], converted["_response"]

# Concatenate them into a single message list.
messages = [*prompt_msgs, *response_msgs]

print("---- messages ----")
print(pprint.pformat(messages))

# encode_oneturn takes the user/assistant messages and returns two results,
# unpacked here as the prompt token ids and the response token ids.
prompt_ids, response_ids = template.encode_oneturn(tokenizer, messages)

print("\n✅ 编码完成！")

---- 转换后格式（converted） ----
{'_audios': None,
 '_images': None,
 '_prompt': [{'content': 'Transform the following sentence using a synonym: '
                         'The car sped quickly.',
              'role': 'user'}],
 '_response': [{'content': 'The car accelerated rapidly.',
                'role': 'assistant'}],
 '_system': '',
 '_tools': '',
 '_videos': None}
---- messages ----
[{'content': 'Transform the following sentence using a synonym: The car sped '
             'quickly.',
  'role': 'user'},
 {'content': 'The car accelerated rapidly.', 'role': 'assistant'}]

✅ 编码完成！


In [9]:
# Decode both id sequences back to text, keeping special tokens visible so the
# template's formatting can be inspected.
for header, token_ids in (
    ("—— Prompt 解码 ——", prompt_ids),
    ("\n—— Response 解码 ——", response_ids),
):
    print(header)
    print(tokenizer.decode(token_ids, skip_special_tokens=False))


—— Prompt 解码 ——
User: Transform the following sentence using a synonym: The car sped quickly.

Assistant:

—— Response 解码 ——
The car accelerated rapidly.<|im_end|>
