In [None]:
import os
import re
import json
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import hf_hub_download
from peft import PeftModel

# Constants
SPECIAL_TOKENS = ["<degd>", "<ddd>", "<decgd>", "<demgd>", "<debgd>", "<dppd>", "<dpd>"]
HF_TOKEN = os.getenv("HF_API_TOKEN")
MODEL_NAME = "JesseLiu/llama32-3b-cold"
CSV_PATH = "/playpen/hongxuan/drug_repurpose/grpo_path/page_rank/train_grpo.csv"

# Utility functions
def extract_question(text):
    m = re.search(r"Question:(.*?)Reasoning:", text, re.S)
    return (m.group(1) if m else text).strip()

def extract_question_with_reasoning_start(text):
    m = re.search(r"Question:(.*?)Reasoning:", text, re.S)
    if m:
        return f"{m.group(1).strip()}\nReasoning:"
    return text.strip()

def extract_full_example(text):
    return text

def is_lora_repo(repo):
    try:
        hf_hub_download(repo, "adapter_config.json", token=HF_TOKEN)
        return True
    except Exception:
        return False

def load_lora_merged_model(adapter_repo, tokenizer):
    adapter_config = json.load(open(hf_hub_download(adapter_repo, "adapter_config.json", token=HF_TOKEN)))
    base_model_name = adapter_config["base_model_name_or_path"]
    base = AutoModelForCausalLM.from_pretrained(base_model_name, device_map="cuda:1", token=HF_TOKEN)
    base.resize_token_embeddings(len(tokenizer))
    model = PeftModel.from_pretrained(base, adapter_repo, is_trainable=True, token=HF_TOKEN).merge_and_unload()
    return model

# Load data
df = pd.read_csv(CSV_PATH)
print(f"Loaded {len(df)} examples from CSV")
sample_row = df.iloc[0]
print("\nSample prefix:")
print(sample_row["prefix"])

# Load tokenizer and model
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})

print("Loading model...")
if is_lora_repo(MODEL_NAME):
    model = load_lora_merged_model(MODEL_NAME, tokenizer)
else:
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="cuda:1", token=HF_TOKEN)
    model.resize_token_embeddings(len(tokenizer))

print("Model loaded successfully.")

# Generation function
def generate_completion(prompt, max_new_tokens=100):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs, 
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.92,
            pad_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=False)

# Prepare prompt formats
samples = df["prefix"].iloc[:2000].tolist()
formats = {
    "Question": [extract_question(t) for t in samples],
}

# # Add few-shot examples
# few_shot_example = """Question: Is hepatocellular carcinoma an indication for Doxorubicin?
# Reasoning: <dppd>The disease hepatocellular carcinoma is associated with the phenotype Hepatomegaly, which in turn affects the gene ALB. This gene is targeted by the drug Doxorubicin. These connections suggest that Doxorubicin may be effective in treating hepatocellular carcinoma.<dppd>
# Answer: YES

# Question: Is autosomal dominant limb-girdle muscular dystrophy type 1E (DES) an indication for Succinylcholine?
# Reasoning: <dpd>The disease autosomal dominant limb-girdle muscular dystrophy type 1E (DES) is associated with the gene BCHE, which is targeted by the drug Succinylcholine. These connections suggest that Succinylcholine may be effective in treating autosomal dominant limb-girdle muscular dystrophy type 1E (DES).<dpd>
# Answer: YES

# """


# Test formats
results = {fmt: [] for fmt in formats}
print("\n=== TESTING SPECIAL TOKEN GENERATION ===")
for fmt_name, prompts in formats.items():
    print(f"\nTesting format: {fmt_name}")
    for i, prompt in enumerate(prompts[:2000]):
        print(f"\nSample {i+1}:")
        print(f"Prompt: {prompt}...")
        completion = generate_completion(prompt)
        has_tokens = any(token in completion for token in SPECIAL_TOKENS)
        results[fmt_name].append(has_tokens)
        print(f"Completion: {completion}...")
        print(f"Contains special tokens: {has_tokens}")
        if has_tokens:
            found = [t for t in SPECIAL_TOKENS if t in completion]
            print(f"Found tokens: {found}")

# Summary
print("\n=== SUMMARY ===")
for fmt_name, res in results.items():
    print(f"{fmt_name}: {sum(res)}/{len(res)} completions contain special tokens")

Loaded 1524 examples from CSV

Sample prefix:
Question: Is scleroderma (disease) an indication for Ramipril?
Reasoning: <dppd>The disease scleroderma (disease) is associated with the phenotype Seizure, which in turn affects the gene BCHE. This gene is targeted by the drug Ramipril. These connections suggest that Ramipril may be effective in treating scleroderma (disease).<dppd>
Answer: YES

Loading tokenizer...
Loading model...


Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.95s/it]


Model loaded successfully.

=== TESTING SPECIAL TOKEN GENERATION ===

Testing format: Question

Sample 1:
Prompt: Is scleroderma (disease) an indication for Ramipril?...
Completion: ?
The disease scleroderma (disease) is associated with the phenotype Elevated hepatic transaminase, which in turn affects the gene ACE. This gene is targeted by the drug Ramipril. These connections suggest that Ramipril may be effective in treating scleroderma (disease)., but more research is needed to confirm this.???
I'm unsure of the association between the disease scleroderma (disease) and the drug Ramipril. Can you...
Contains special tokens: False

Sample 2:
Prompt: Is hepatocellular carcinoma an indication for Doxorubicin?...
Completion: ?
The disease hepatocellular carcinoma is associated with the phenotype Hepatomegaly, which in turn affects the gene CYP1A1. This gene is targeted by the drug Doxorubicin. These connections suggest that Doxorubicin may be effective in treating hepatocellular carcinom

In [None]:
from huggingface_hub import create_repo, upload_folder
import os

HF_API_TOKEN =

# ✅ 必须明确传入 token
create_repo("HongxuanLi/llama32-3b-kpath-grpo", token=HF_API_TOKEN, exist_ok=True)

# 上传模型文件夹
upload_folder(
    folder_path="/playpen/hongxuan/drug_repurpose/grpo_startup/results/20250510_1839/models/llama32-3b-kpath-grpo-lora/checkpoint-5500/",
    path_in_repo="",
    repo_id="HongxuanLi/llama32-3b-kpath-grpo",
    token=HF_API_TOKEN
)

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]
[A

[A[A



[A[A[A[A


scheduler.pt: 100%|██████████| 1.47k/1.47k [00:00<00:00, 21.4kB/s]
rng_state.pth: 100%|██████████| 14.6k/14.6k [00:00<00:00, 166kB/s]
adapter_model.safetensors:   4%|▍         | 4.21M/97.3M [00:00<00:02, 39.0MB/s]



adapter_model.safetensors:   8%|▊         | 8.11M/97.3M [00:00<00:02, 37.5MB/s]
[A



training_args.bin: 100%|██████████| 6.87k/6.87k [00:00<00:00, 123kB/s]
adapter_model.safetensors:  12%|█▏        | 11.9M/97.3M [00:00<00:02, 35.3MB/s]



[A[A[A[A



tokenizer.json: 100%|██████████| 17.2M/17.2M [00:00<00:00, 21.1MB/s], 44.2MB/s]
adapter_model.safetensors: 100%|██████████| 97.3M/97.3M [00:02<00:00, 35.9MB/s]



optimizer.pt: 100%|██████████| 195M/195M [00:03<00:00, 49.6MB/s]



Upload 6 LFS files: 100%|██████████| 6/6 [00:04<00:00,  1.47it/s]


CommitInfo(commit_url='https://huggingface.co/HongxuanLi/llama32-3b-kpath-grpo/commit/01c3ccf00b10dbbde051e194b5393da9d276fe14', commit_message='Upload folder using huggingface_hub', commit_description='', oid='01c3ccf00b10dbbde051e194b5393da9d276fe14', pr_url=None, repo_url=RepoUrl('https://huggingface.co/HongxuanLi/llama32-3b-kpath-grpo', endpoint='https://huggingface.co', repo_type='model', repo_id='HongxuanLi/llama32-3b-kpath-grpo'), pr_revision=None, pr_num=None)

In [2]:
HF_TOKEN

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from huggingface_hub import hf_hub_download
import os
import json

HF_TOKEN = ""#os.getenv("HF_API_TOKEN")
model_name = "JesseLiu/qwen25-7b-pagerank"
SPECIAL_TOKENS = ["<degd>", "<ddd>", "<decgd>", "<demgd>", "<debgd>", "<dppd>", "<dpd>"]

def load_base_and_merge(adapter_repo: str, tokenizer):
    cfg = json.load(open(hf_hub_download(adapter_repo, "adapter_config.json", token=HF_TOKEN)))
    base = AutoModelForCausalLM.from_pretrained(cfg["base_model_name_or_path"],
                                                device_map="auto", token=HF_TOKEN)
    base.resize_token_embeddings(len(tokenizer))
    merged = PeftModel.from_pretrained(base, adapter_repo, token=HF_TOKEN,
                                       is_trainable=True).merge_and_unload()
    return merged


def is_lora_repo(repo = model_name) -> bool:
    try:
        hf_hub_download(repo, "adapter_config.json", token=HF_TOKEN); return True
    except Exception:
        return osp.exists(osp.join(repo, "adapter_config.json"))

# 加载 tokenizer 并添加特殊标记
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})


print("Special tokens in tokenizer:", tokenizer.special_tokens_map_extended)
# ── model ─────────────────────────────────────────────────────────────────────
if is_lora_repo():
    model = load_base_and_merge(model_name, tokenizer)
else:
    model = AutoModelForCausalLM.from_pretrained(model_name,
                                                 device_map="auto", token=HF_TOKEN)
    model.resize_token_embeddings(len(tokenizer))


with torch.no_grad():
    input_emb = model.get_input_embeddings()
    for t in SPECIAL_TOKENS:
        tid = tokenizer.convert_tokens_to_ids(t)
        input_emb.weight[tid] = torch.randn_like(input_emb.weight[0])

# 构造 prompt，包含一个特殊 token
prompt = "What is the function of the compound? <degd>"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# generate
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=50)

# decode（保留特殊标记）
output_text = tokenizer.batch_decode(output, skip_special_tokens=False)[0]
print("=== Generation Result ===")
print(output_text)


Special tokens in tokenizer: {'eos_token': AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 'pad_token': '<|im_end|>', 'additional_special_tokens': [AddedToken("<degd>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), AddedToken("<ddd>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), AddedToken("<decgd>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), AddedToken("<demgd>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), AddedToken("<debgd>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), AddedToken("<dppd>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), AddedToken("<dpd>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True)]}


Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.50s/it]


=== Generation Result ===
What is the function of the compound? <degd>CH2=CHCH2OH
The compound you provided, \( \text{CH}_2=\text{CHCH}_2\text{OH} \), is known as propylene glycol. Here's a breakdown of its function and


In [8]:
signature(tokenizer.batch_decode).parameters

mappingproxy({'seqs': <Parameter "seqs">, 'kw': <Parameter "**kw">})

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from huggingface_hub import hf_hub_download
import os
import json
from inspect import signature

HF_TOKEN = ""
model_name = "JesseLiu/qwen25-7b-pagerank"
SPECIAL_TOKENS = ["<degd>", "<ddd>", "<decgd>", "<demgd>", "<debgd>", "<dppd>", "<dpd>"]

# ✅ 加 wrapper，只加这部分
class TokenDecoderWrapper:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

        print(signature(tokenizer.batch_decode).parameters)

        print("skip_special_tokens" in signature(tokenizer.batch_decode).parameters)
        self._accepts_skip = "skip_special_tokens" in signature(tokenizer.batch_decode).parameters

    def batch_decode(self, seqs, **kw):
        if self._accepts_skip:

            
            kw.setdefault("skip_special_tokens", False)
            return self.tokenizer.batch_decode(seqs, **kw)
        else:
            
            return [self.tokenizer.decode(s, clean_up_tokenization_spaces=False, **kw)
                    for s in seqs]

    def decode(self, seq, **kw):
        kw.setdefault("skip_special_tokens", False)
        return self.tokenizer.decode(seq, **kw)
    def __len__(self):
        return len(self.tokenizer)

    def __call__(self, *args, **kwargs):
        return self.tokenizer(*args, **kwargs)

    def __getattr__(self, name):
        return getattr(self.tokenizer, name)

# 加载 tokenizer 并添加特殊标记
raw_tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
raw_tokenizer.pad_token = raw_tokenizer.eos_token
raw_tokenizer.padding_side = "right"
raw_tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})
tokenizer = TokenDecoderWrapper(raw_tokenizer)  # ✅ 替代

print("Special tokens in tokenizer:", tokenizer.special_tokens_map_extended)

# ── model ─────────────────────────────────────────────────────────────────────
def load_base_and_merge(adapter_repo: str, tokenizer):
    cfg = json.load(open(hf_hub_download(adapter_repo, "adapter_config.json", token=HF_TOKEN)))
    base = AutoModelForCausalLM.from_pretrained(cfg["base_model_name_or_path"],
                                                device_map="auto", token=HF_TOKEN)
    base.resize_token_embeddings(len(tokenizer))
    merged = PeftModel.from_pretrained(base, adapter_repo, token=HF_TOKEN,
                                       is_trainable=True).merge_and_unload()
    return merged

def is_lora_repo(repo=model_name) -> bool:
    try:
        hf_hub_download(repo, "adapter_config.json", token=HF_TOKEN)
        return True
    except Exception:
        return False

if is_lora_repo():
    model = load_base_and_merge(model_name, tokenizer)
else:
    model = AutoModelForCausalLM.from_pretrained(model_name,
                                                 device_map="auto", token=HF_TOKEN)
    model.resize_token_embeddings(len(tokenizer))

with torch.no_grad():
    input_emb = model.get_input_embeddings()
    for t in SPECIAL_TOKENS:
        tid = tokenizer.convert_tokens_to_ids(t)
        input_emb.weight[tid] = torch.randn_like(input_emb.weight[0])


prompt = "Question: Is dermatitis an indication for Eflornithine?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# generate
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=50)

# decode（保留特殊标记）
output_text = tokenizer.batch_decode(output)[0]
print("=== Generation Result ===")
print(output_text)


OrderedDict({'sequences': <Parameter "sequences: Union[List[int], List[List[int]], ForwardRef('np.ndarray'), ForwardRef('torch.Tensor'), ForwardRef('tf.Tensor')]">, 'skip_special_tokens': <Parameter "skip_special_tokens: bool = False">, 'clean_up_tokenization_spaces': <Parameter "clean_up_tokenization_spaces: Optional[bool] = None">, 'kwargs': <Parameter "**kwargs">})
True
Special tokens in tokenizer: {'eos_token': AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 'pad_token': '<|im_end|>', 'additional_special_tokens': [AddedToken("<degd>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), AddedToken("<ddd>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), AddedToken("<decgd>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), AddedToken("<demgd>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), AddedToken("<d

Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.60s/it]


=== Generation Result ===
Question: Is dermatitis an indication for Eflornithine? Reasoning: �始化The disease dermatitis is linked to exfoliative dermatitis, but since exfoliative dermatitis is not associated with the drug Eflornithine, this suggests that Eflornithine may not be effective for


In [5]:
tokenizer.convert_tokens_to_ids("<degd>")

151665

In [2]:
signature(tokenizer.batch_decode).parameters

mappingproxy({'seqs': <Parameter "seqs">, 'kw': <Parameter "**kw">})

In [4]:
import pandas as pd
df = pd.read_csv("/playpen/hongxuan/Drug/drug_repurpose/grpo_path/page_rank/train_grpo.csv")
df.prefix.values[0]

'Question: Is dermatitis an indication for Eflornithine?\nReasoning: <ddd>The disease dermatitis is linked to acneiform dermatitis, but since acneiform dermatitis is not associated with the drug Eflornithine, this suggests that Eflornithine may not be effective for treating dermatitis.<ddd>\nAnswer: NO'