In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time

# 指定模型路径
model_path = "/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/yuhongzhou/rebuttal/FineMedLM-o1"

# 加载模型和 tokenizer，并放到 GPU 上
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda:1")  # 或 torch.device("cuda")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:15<00:00,  3.82s/it]


In [2]:
tokenizer.pad_token, tokenizer.eos_token

('<|eot_id|>', '<|eot_id|>')

In [2]:
# 构造输入
prompt = (
    """The following are multiple choice questions (with answers) about health. Think step by step and then finish your answer with "the answer is (X)" where X is the correct letter choice.


Question:
Polio can be eradicated by which of the following?
Options:
A. Herbal remedies
B. Use of antibiotics
C. Regular intake of vitamins
D. Administration of tetanus vaccine
E. Attention to sewage control and hygiene
F. Natural immunity acquired through exposure
G. Use of antiviral drugs
Answer: Let's think step by step.
"""
)
messages = [
    {"role": "system", "content": """You are a helpful professional doctor. You need to generate an answer based on the given problem and thoroughly explore the problem through a systematic and long-term thinking process to provide a final and accurate solution. This requires a comprehensive cycle of analysis, summary, exploration, re-evaluation, reflection, backtracking and iteration to form a thoughtful thinking process. Use the background information provided in the text to assist in formulating the answer. Follow these answer guidelines:
1. Please structure your response into two main sections: **Thought** and **Summarization**.
2. During the **Thought** phase, think step by step based on the given text content. If the text content is used, it must be expressed.
3. During the **Summarization** phase, based on the thinking process in the thinking phase, give the final answer to the question.
Here is the question: """},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)

model_inputs = tokenizer([text], return_tensors="pt").to("cuda:1")  # 把输入也放到 GPU

# 推理并计时
print("-----start generate-----")
start_time = time.time()
generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=2048,
    eos_token_id=tokenizer.eos_token_id
)
end_time = time.time()
print(f"Generation took {end_time - start_time:.2f} seconds.")

# 解码输出
generated_text = tokenizer.batch_decode(
    generated_ids[:, model_inputs.input_ids.shape[-1]:],
    skip_special_tokens=False
)[0]
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful professional doctor. You need to generate an answer based on the given problem and thoroughly explore the problem through a systematic and long-term thinking process to provide a final and accurate solution. This requires a comprehensive cycle of analysis, summary, exploration, re-evaluation, reflection, backtracking and iteration to form a thoughtful thinking process. Use the background information provided in the text to assist in formulating the answer. Follow these answer guidelines:
1. Please structure your response into two main sections: **Thought** and **Summarization**.
2. During the **Thought** phase, think step by step based on the given text content. If the text content is used, it must be expressed.
3. During the **Summarization** phase, based on the thinking process in the thinking phase, give the final answer to the question.
Here 

In [15]:
from datasets import load_dataset

# 读取 jsonl 文件
dataset = load_dataset("json", data_files="/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/yuhongzhou/rebuttal/Endocrinology.jsonl", split="train")

# 打印部分数据
dataset[0], dataset

Generating train split: 11477 examples [00:00, 78284.80 examples/s]


({'text': 'IDEAL PROTEIN WEIGHT LOSS METHOD\nA Medically Developed Weight Loss Method\nWith a Beginning, Middle and End.\nOur Weight Loss Method is a medically designed protocol containing 2 key components – weight loss and a healthier lifestyle education to assist you in maintaining your results after dieting. Our protocol has evolved for over 20 years, but was originally developed over 2 decades ago by Dr. Tran Tien Chanh who focused his career and research on nutrition with a particular emphasis on the treatment of obesity and obesity related issues.\nThis protocol is the recommended weight loss method in over 3,000 Professional Establishments worldwide!\nUnderstanding The Cause of Weight Gain Will Help You Conquer it\nAn overproduction of insulin may lead to hypoglycemia, or low blood sugar, which in turn may induce constant sugar cravings and weight gain. One of Insulin’s primary functions is to regulate blood sugar levels, however, it is also the hormone that facilitates the tran

In [17]:
dataset = ds['train']
dataset[0], dataset

({'text': 'Arthur W. English, PhD, is Professor of Cell Biology and Rehabilitation Medicine at the School of Medicine, Emory University. At Emory, he concentrates on investigating recovery from nerve injury from multiple perspectives. On one level, he investigates methods to enhance axon regeneration post-injury, advancing our understanding of the interactions between neuronal activity, gonadal hormones, and neurotrophins. On another level, Dr. English is exploring spinal cord nerve synapse plasticity following nerve injury, with particular attention to the role of axonal protein synthesis. On a third level, Dr. English researches the effects of exercise on post-injury functional nerve recovery – this program is funded by the National Center for Medical Rehabilitation Research at the National Institute of Child Health and Human Development. Dr. English received his Ph.D. in Neuroscience at the University of Illinois.\nArthur W. English, PhD\nProfessor of Cell Biology and Rehabilitation

In [None]:
# FineMedLM
def convert_to_messages_format(data):
    return {
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful professional doctor. The user will give you a medical question, and you should answer it in a professional way."
            },
            {
                "role": "user",
                "content": data["instruction"]
            },
            {
                "role": "assistant",
                "content": data["response"]
            }
        ]
    }

# 转换结果
converted_data = dataset.map(convert_to_messages_format, remove_columns=dataset.column_names)
converted_data

In [18]:
dataset[0]['prompt'][0]

{'content': 'How do the interactions between neuronal activity, gonadal hormones, and neurotrophins influence axon regeneration post-injury, and what are the potential therapeutic implications of this research? Please think step by step.',
 'role': 'user'}

In [23]:
# FineMedLM-o1
def convert_to_messages_format(example):
    prompt = [{'content': """You are a helpful professional doctor. You need to generate an answer based on the given problem and thoroughly explore the problem through a systematic and long-term thinking process to provide a final and accurate solution. This requires a comprehensive cycle of analysis, summary, exploration, re-evaluation, reflection, backtracking and iteration to form a thoughtful thinking process. Use the background information provided in the text to assist in formulating the answer. Follow these answer guidelines:
1. Please structure your response into two main sections: **Thought** and **Summarization**.
2. During the **Thought** phase, think step by step based on the given text content. If the text content is used, it must be expressed.
3. During the **Summarization** phase, based on the thinking process in the thinking phase, give the final answer to the question.
Here is the question: """,
    'role': 'system'}]    
    for p in example["prompt"]:
        prompt.append(p)
    example["prompt"] = prompt
    return example

# 转换结果
converted_data = dataset.map(convert_to_messages_format, remove_columns=["text", 'complexity', 'quality', 'language'])
converted_data

Map: 100%|██████████| 32919/32919 [00:02<00:00, 13052.68 examples/s]


Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 32919
})

In [24]:
converted_data[0]

{'prompt': [{'content': 'You are a helpful professional doctor. You need to generate an answer based on the given problem and thoroughly explore the problem through a systematic and long-term thinking process to provide a final and accurate solution. This requires a comprehensive cycle of analysis, summary, exploration, re-evaluation, reflection, backtracking and iteration to form a thoughtful thinking process. Use the background information provided in the text to assist in formulating the answer. Follow these answer guidelines:\n1. Please structure your response into two main sections: **Thought** and **Summarization**.\n2. During the **Thought** phase, think step by step based on the given text content. If the text content is used, it must be expressed.\n3. During the **Summarization** phase, based on the thinking process in the thinking phase, give the final answer to the question.\nHere is the question: ',
   'role': 'system'},
  {'content': 'How do the interactions between neuron

In [13]:
converted_data = converted_data.select(range(12800))
converted_data

Dataset({
    features: ['messages'],
    num_rows: 12800
})

In [25]:
converted_data.save_to_disk("/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/yuhongzhou/rebuttal/dpo2")

Saving the dataset (1/1 shards): 100%|██████████| 32919/32919 [00:00<00:00, 107245.78 examples/s]


In [33]:
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/yuhongzhou/rebuttal/llama8b")

In [27]:
tokenizer.pad_token

'<|eot_id|>'

In [21]:
tokenizer.save_pretrained("/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/yuhongzhou/rebuttal/llama8b")

('/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/yuhongzhou/rebuttal/llama8b/tokenizer_config.json',
 '/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/yuhongzhou/rebuttal/llama8b/special_tokens_map.json',
 '/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/yuhongzhou/rebuttal/llama8b/tokenizer.json')

In [81]:
from datasets import load_dataset
converted_data = load_dataset("json", data_files="/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/yuhongzhou/rebuttal/OtherDepartments.jsonl", split="train")

Generating train split: 158629 examples [00:01, 153506.20 examples/s]


In [82]:
converted_data[0]

{'text': 'A Comprehensive Guide: How to Find a Therapist Near You. Cbt Therapist Near Me Nhs…\nSeeking treatment is a bold step toward much better mental health and wellness. Whether you’re having problem with stress and anxiety, anxiety, or any other mental concern, discovering the ideal therapist can make a substantial distinction in your healing journey. In this post, we will check out the procedure of finding a therapist near you, go over various kinds of therapists readily available, and offer recommendations for various psychological disorders.\nTypes of Therapists: Cbt Therapist Near Me Nhs\nPsychologists: These experts hold a postgraduate degree in psychology and focus on detecting and treating psychological health disorders. They frequently use evidence-based treatments such as cognitive-behavioral treatment (CBT) and psychodynamic therapy.\nPsychiatrists: Psychiatrists are medical physicians who focus on psychological health. They can diagnose mental illnesses, recommend medi

In [83]:
from tqdm import tqdm

quality = 0
complexity = 0
for data in tqdm(converted_data):
    quality += data["quality"]
    complexity += data["complexity"]
print(quality/len(converted_data))
print(complexity/len(converted_data))

100%|██████████| 158629/158629 [00:07<00:00, 20447.74it/s]

8.070668036739814
6.157808471338784





In [None]:
# SFT
from tqdm import tqdm
messages = []
for example in tqdm(converted_data):
    messages.append(tokenizer.apply_chat_template(example["messages"], tokenize=False))
len(messages)

100%|██████████| 12800/12800 [00:01<00:00, 11685.10it/s]


12800

In [37]:
# DPO
from tqdm import tqdm

def AddChat(example):
    if "prompt" in example:
            last_role = example["prompt"][-1]["role"]
            if last_role == "user":
                add_generation_prompt = True
                continue_final_message = False
            elif last_role == "assistant":
                add_generation_prompt = False
                continue_final_message = True
            else:
                raise ValueError(f"Invalid role in the last message: {last_role}")
            prompt = tokenizer.apply_chat_template(
                example["prompt"],
                continue_final_message=continue_final_message,
                tokenize=False,
                add_generation_prompt=add_generation_prompt,
            )

    # Apply the chat template to the entire prompt + completion
    if "prompt" in example:  # explicit prompt and prompt-completion case
        if "chosen" in example:
            prompt_chosen = tokenizer.apply_chat_template(
                example["prompt"] + example["chosen"], tokenize=False
            )
            chosen = prompt_chosen[len(prompt) :]
        if "rejected" in example and "prompt" in example:  # explicit prompt
            prompt_rejected = tokenizer.apply_chat_template(
                example["prompt"] + example["rejected"], tokenize=False
            )
            rejected = prompt_rejected[len(prompt) :]
        if "completion" in example:
            prompt_completion = tokenizer.apply_chat_template(
                example["prompt"] + example["completion"], tokenize=False
            )
            completion = prompt_completion[len(prompt) :]
    else:  # implicit prompt case
        if "chosen" in example:
            chosen = tokenizer.apply_chat_template(example["chosen"], tokenize=False)
        if "rejected" in example:
            rejected = tokenizer.apply_chat_template(example["rejected"], tokenize=False)

    # Ensure that the prompt is the initial part of the prompt-completion string
    if "prompt" in example:
        error_message = (
            "The chat template applied to the prompt + completion does not start with the chat template applied to "
            "the prompt alone. This can indicate that the chat template is not supported by TRL."
            "\n**Prompt**:\n{}\n\n**Prompt + Completion**:\n{}"
        )
        if "chosen" in example and not prompt_chosen.startswith(prompt):
            raise ValueError(error_message.format(prompt, prompt_chosen))
        if "rejected" in example and not prompt_rejected.startswith(prompt):
            raise ValueError(error_message.format(prompt, prompt_rejected))
        if "completion" in example and not prompt_completion.startswith(prompt):
            raise ValueError(error_message.format(prompt, prompt_completion))

    # Extract the completion by removing the prompt part from the prompt-completion string
    output = {}
    if "prompt" in example:
        output["prompt"] = prompt
    if "chosen" in example:
        output["chosen"] = chosen
    if "rejected" in example:
        output["rejected"] = rejected
    if "completion" in example:
        output["completion"] = completion
    if "label" in example:
        output["label"] = example["label"]
    return output

messages = []
for example in tqdm(converted_data):
    messages.append(AddChat(example))
len(messages)

100%|██████████| 32919/32919 [00:05<00:00, 5557.42it/s]


32919

In [38]:
messages[0]

{'prompt': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYou are a helpful professional doctor. You need to generate an answer based on the given problem and thoroughly explore the problem through a systematic and long-term thinking process to provide a final and accurate solution. This requires a comprehensive cycle of analysis, summary, exploration, re-evaluation, reflection, backtracking and iteration to form a thoughtful thinking process. Use the background information provided in the text to assist in formulating the answer. Follow these answer guidelines:\n1. Please structure your response into two main sections: **Thought** and **Summarization**.\n2. During the **Thought** phase, think step by step based on the given text content. If the text content is used, it must be expressed.\n3. During the **Summarization** phase, based on the thinking process in the thinking phase, give the final answer to

- SFT  max_length -> 6064
- DPO  prompt  max_length -> 281
- DPO  chosen  max_length -> 5826
- 取 6144

In [40]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/yuhongzhou/rebuttal/llama8b")

# 编码并获取每个文本的 token 长度
lengths = [len(tokenizer.encode(text['chosen'], add_special_tokens=True)) for text in tqdm(messages)]

# 计算最大长度
max_length = max(lengths)

print(f"最大编码长度为: {max_length}")

100%|██████████| 32919/32919 [01:22<00:00, 397.22it/s]

最大编码长度为: 5826





In [50]:
count = 0
for length in lengths:
    if length > 1024:
        count += 1
count

8