In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load the tokenizer from the local path "/FineMedLM-o1". AddChat reads this
# notebook-level `tokenizer` to render chat templates.
# NOTE(review): a later cell rebinds `tokenizer` to the embedding model's tokenizer,
# so this cell has to be re-run before AddChat is re-run.
tokenizer = AutoTokenizer.from_pretrained("/FineMedLM-o1")

In [1]:
from datasets import load_dataset

# Load the FineMed-DPO dataset from a local directory and display its summary.
# NOTE(review): hard-coded absolute, machine-specific path — consider a configurable data directory.
dataset = load_dataset("/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/public/yhz/FineMed-DPO")
dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'complexity', 'quality', 'language', 'prompt', 'chosen', 'rejected'],
        num_rows: 32919
    })
})

In [3]:
# Keep only the train split and preview the first record.
# The guard makes this cell idempotent: an unconditional `dataset = dataset['train']`
# fails when the cell is run a second time, because `dataset` is by then already
# the split rather than the DatasetDict that holds it.
if isinstance(dataset, dict):  # DatasetDict subclasses dict; a single Dataset does not
    dataset = dataset['train']
dataset[0]

{'text': 'Arthur W. English, PhD, is Professor of Cell Biology and Rehabilitation Medicine at the School of Medicine, Emory University. At Emory, he concentrates on investigating recovery from nerve injury from multiple perspectives. On one level, he investigates methods to enhance axon regeneration post-injury, advancing our understanding of the interactions between neuronal activity, gonadal hormones, and neurotrophins. On another level, Dr. English is exploring spinal cord nerve synapse plasticity following nerve injury, with particular attention to the role of axonal protein synthesis. On a third level, Dr. English researches the effects of exercise on post-injury functional nerve recovery – this program is funded by the National Center for Medical Rehabilitation Research at the National Institute of Child Health and Human Development. Dr. English received his Ph.D. in Neuroscience at the University of Illinois.\nArthur W. English, PhD\nProfessor of Cell Biology and Rehabilitation 

In [6]:
# FineMedLM-o1
def convert_to_messages_format(example):
    """Prepend the fixed system message to ``example["prompt"]``.

    The messages already present in ``example["prompt"]`` are kept, in order,
    after the system message. ``example`` is updated in place and returned.
    """
    system_message = {
        'content': """You are a helpful professional doctor. You need to generate an answer based on the given problem and thoroughly explore the problem through a systematic and long-term thinking process to provide a final and accurate solution. This requires a comprehensive cycle of analysis, summary, exploration, re-evaluation, reflection, backtracking and iteration to form a thoughtful thinking process. Use the background information provided in the text to assist in formulating the answer. Follow these answer guidelines:
1. Please structure your response into two main sections: **Thought** and **Summarization**.
2. During the **Thought** phase, think step by step based on the given text content. If the text content is used, it must be expressed.
3. During the **Summarization** phase, based on the thinking process in the thinking phase, give the final answer to the question.
Here is the question: """,
        'role': 'system',
    }
    # Build the new list in one step: system message first, original messages after.
    example["prompt"] = [system_message, *example["prompt"]]
    return example

# Convert every example to the messages format (system message prepended) and
# drop the listed columns, leaving prompt / chosen / rejected.
converted_data = dataset.map(convert_to_messages_format, remove_columns=["text", 'complexity', 'quality', 'language'])
converted_data

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 32919
})

In [9]:
# DPO
from tqdm import tqdm

def AddChat(example, chat_tokenizer=None):
    """Render a conversational example into plain strings via the chat template.

    Args:
        example: Mapping that may contain ``"prompt"``, ``"chosen"``,
            ``"rejected"`` and ``"completion"`` (each a list of message dicts;
            the last prompt message must carry a ``"role"`` of ``"user"`` or
            ``"assistant"``), plus an optional ``"label"``.
        chat_tokenizer: Object exposing ``apply_chat_template``. When omitted,
            the notebook-level ``tokenizer`` is used. Passing it explicitly
            protects against that global having been rebound by another cell.

    Returns:
        A dict with the same keys that were present in ``example`` (among
        ``prompt``, ``chosen``, ``rejected``, ``completion``, ``label``).
        With an explicit prompt, ``prompt`` is the rendered prompt and the
        other text fields hold only the part rendered after the prompt.
        Without a prompt, ``chosen``/``rejected`` are rendered in full.

    Raises:
        ValueError: if the last prompt message has a role other than
            ``user``/``assistant``, if a rendered prompt+response does not
            start with the rendered prompt, or if ``"completion"`` is given
            without ``"prompt"``.
    """
    if chat_tokenizer is None:
        # Backward-compatible default: fall back to the notebook-level tokenizer.
        chat_tokenizer = tokenizer

    error_message = (
        "The chat template applied to the prompt + completion does not start with the chat template applied to "
        "the prompt alone. This can indicate that the chat template is not supported by TRL."
        "\n**Prompt**:\n{}\n\n**Prompt + Completion**:\n{}"
    )

    output = {}
    if "prompt" in example:  # explicit prompt and prompt-completion case
        last_role = example["prompt"][-1]["role"]
        if last_role == "user":
            # The model should start a fresh assistant turn.
            add_generation_prompt, continue_final_message = True, False
        elif last_role == "assistant":
            # The model should continue the partially written assistant turn.
            add_generation_prompt, continue_final_message = False, True
        else:
            raise ValueError(f"Invalid role in the last message: {last_role}")

        prompt = chat_tokenizer.apply_chat_template(
            example["prompt"],
            continue_final_message=continue_final_message,
            tokenize=False,
            add_generation_prompt=add_generation_prompt,
        )
        output["prompt"] = prompt

        # Render prompt + response together, check that the rendered prompt is a
        # prefix of it, then keep only the text that follows the prompt.
        for key in ("chosen", "rejected", "completion"):
            if key not in example:
                continue
            rendered = chat_tokenizer.apply_chat_template(
                example["prompt"] + example[key], tokenize=False
            )
            if not rendered.startswith(prompt):
                raise ValueError(error_message.format(prompt, rendered))
            output[key] = rendered[len(prompt):]
    else:  # implicit prompt case
        if "completion" in example:
            # Previously this path ended in an UnboundLocalError on `completion`.
            raise ValueError('A "completion" requires an explicit "prompt" in the example.')
        for key in ("chosen", "rejected"):
            if key in example:
                output[key] = chat_tokenizer.apply_chat_template(example[key], tokenize=False)

    if "label" in example:
        output["label"] = example["label"]
    return output

# Render every converted example with AddChat, with a progress bar over the dataset.
messages = [AddChat(example) for example in tqdm(converted_data)]
len(messages)

100%|██████████| 32919/32919 [00:06<00:00, 5323.90it/s]


32919

In [11]:
# Join each rendered prompt with its chosen response, then preview the first one.
texts = [rendered['prompt'] + rendered['chosen'] for rendered in messages]
texts[0]

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYou are a helpful professional doctor. You need to generate an answer based on the given problem and thoroughly explore the problem through a systematic and long-term thinking process to provide a final and accurate solution. This requires a comprehensive cycle of analysis, summary, exploration, re-evaluation, reflection, backtracking and iteration to form a thoughtful thinking process. Use the background information provided in the text to assist in formulating the answer. Follow these answer guidelines:\n1. Please structure your response into two main sections: **Thought** and **Summarization**.\n2. During the **Thought** phase, think step by step based on the given text content. If the text content is used, it must be expressed.\n3. During the **Summarization** phase, based on the thinking process in the thinking phase, give the final answer to the questi

In [12]:
import json

# Save as a JSONL file: one {"text": ...} object per line, non-ASCII left unescaped.
with open("/tttdata.jsonl", "w", encoding="utf-8") as jsonl_file:
    jsonl_file.writelines(
        json.dumps({"text": entry}, ensure_ascii=False) + "\n" for entry in texts
    )

In [13]:
import torch
import json

from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from pathlib import Path

# Configure the embedding model
# NOTE(review): hard-coded absolute, machine-specific path.
embedding_model_name = "/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/public/yhz/embedding_model"
# NOTE(review): this rebinds the notebook-level `tokenizer` that AddChat reads, so
# re-running the AddChat cell after this one would use the embedding tokenizer.
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
model = AutoModel.from_pretrained(embedding_model_name)
model.eval()
model.cuda()  # original note: "if a GPU is available" — but the call is unconditional

def get_embedding(text):
    """Embed ``text`` by averaging the model's last hidden state over dim 1.

    Uses the notebook-level ``tokenizer`` and ``model``. The input is tokenized
    with truncation, moved to the GPU, and run without gradient tracking. The
    pooled result has its leading dimension squeezed and is returned on the CPU.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    gpu_inputs = {}
    for name, tensor in encoded.items():
        gpu_inputs[name] = tensor.cuda()
    with torch.no_grad():
        hidden_states = model(**gpu_inputs).last_hidden_state
        pooled = hidden_states.mean(dim=1)
    # Per the original note this returns a (D,) vector — presumably for a single input string.
    return pooled.squeeze(0).cpu()

# Read the JSONL file back and embed every non-empty text
jsonl_path = "/tttdata.jsonl"
texts = []
embeddings = []

with open(jsonl_path, "r", encoding="utf-8") as jsonl_file:
    for raw_line in tqdm(jsonl_file, desc="Processing"):
        stripped = json.loads(raw_line).get("text", "").strip()
        if stripped:  # records with a missing or blank "text" are skipped
            texts.append(stripped)
            embeddings.append(get_embedding(stripped))

# Stack the per-text vectors into one tensor
embeddings_tensor = torch.stack(embeddings)  # (N, D) per the original note

# Save the embeddings and the matching texts
torch.save(embeddings_tensor, "/finemed_embeddings.pt")
torch.save(texts, "/finemed_texts.pt")

print("✅ 嵌入和文本已保存完毕。")


Processing: 32919it [18:19, 29.93it/s]


✅ 嵌入和文本已保存完毕。
