In [1]:
# add .. path 
import os
import sys
sys.path.append('..')
import utils.llm_training as llm_training
import utils.llm_configs as llm_configs

import logging

# --- Basic Configuration ---
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - [%(name)s] - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
log = logging.getLogger(__name__)

os.environ["WANDB_PROJECT"]="medex_fine_tuning"


In [11]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("medexanon/Medex")['train']

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

In [37]:
len(ds)

36308777

In [4]:
# === Cell 1: Configuration ===
model_config = llm_configs.ModelConfig(
    id="Qwen/Qwen2.5-0.5B",
    peft=llm_configs.PeftConfig(
        enabled=False,
        add_eot_token=False,  # No longer doing EOT token for LIMA
    ),
    quantization=llm_configs.QuantizationConfig(mode=None), # Use QLoRA
)

log.info("--- Configuration ---")
print(model_config.model_dump_json(indent=2))

log.info("\n--- Loading Model for Training ---")
model, tokenizer = llm_training.load_model_for_training(model_config, log)

2025-07-09 01:54:25 - INFO - [__main__] - --- Configuration ---
2025-07-09 01:54:25 - INFO - [__main__] - 
--- Loading Model for Training ---
2025-07-09 01:54:25 - INFO - [__main__] - Loading model 'Qwen/Qwen2.5-0.5B' for training...


{
  "id": "Qwen/Qwen2.5-0.5B",
  "torch_dtype": "auto",
  "attn_implementation": "flash_attention_2",
  "peft": {
    "enabled": false,
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj",
      "gate_proj",
      "up_proj",
      "down_proj"
    ],
    "add_eot_token": false
  },
  "quantization": {
    "mode": null
  }
}


2025-07-09 01:54:26 - INFO - [__main__] - Model and tokenizer loaded successfully.


In [36]:
def concat_columns(example, tokenizer):
    """
    Combine DOI/entity/fact/MolInfo/GeneInfo into one human-readable string.
    Empty or missing fields are omitted for that row.
    """

    chunks = []

    # 1) flat string columns
    if example.get("entity"):
        chunks.append(f"The following fact is for the entity '{example['entity']}'.")
    if example.get("fact"):
        chunks.append(f" {example['fact']}")

    # 2) MolInfo → [SMILES] …
    mol = example.get("MolInfo")
    if isinstance(mol, dict):
        smiles = mol.get("SMILES")
        if smiles:
            chunks.append(f"The SMILES string of this entity is '{smiles}'.")

    # # 3) GeneInfo → [GeneInfo] key: value, …
    # gene = example.get("GeneInfo")
    # if isinstance(gene, dict) and gene:
    #     def _fmt(key, val):
    #         return f'"{key}": {val}' if isinstance(val, int) else f'"{key}": "{val}"'
    #     fields = [_fmt(k, v) for k, v in gene.items() if v not in (None, "", [])]
    #     if fields:
    #         chunks.append(f"The NCBI Gene information of this entity is " + ", ".join(fields))
    #         print(f"The NCBI Gene information of this entity is " + ", ".join(fields))
    # join all parts with a single space
    return {"text": " ".join(chunks) + tokenizer.eos_token}

# ---- apply to your Dataset ----
# creates a new 'text' column, keeps the originals (remove_columns=[] by default)
ds_with_text = ds.map(concat_columns, fn_kwargs={"tokenizer": tokenizer},  desc="Building concatenated text")

Building concatenated text:   0%|          | 0/36308777 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
medex_ds = ds_with_text.select_columns(["text"])
medex_ds[90]

{'text': "The following fact is for the entity 'formic acid'.  Formic acid is used in combination with acetic acid and water to create a buffer at pH 2.1 for paper electrophoresis of peptides. The SMILES string of this entity is 'O=CO'.<|endoftext|>"}

In [None]:
lima_training_config = llm_configs.TrainingConfig(
    run_name = "36M samples on medex",
    num_train_epochs = 1,
    learning_rate  = 1e-5,
    logging_strategy = "steps", 
    logging_steps = 100,
    gradient_checkpointing=False,
    context_length = 1024,
    use_liger_kernel=True,
    per_device_train_batch_size = 16,
    gradient_accumulation_steps=16,
    # warmup_steps  = 0, # LIMA specifies no warmup, so we set this explicitly
    warmup_ratio = 0.3, # Use our default warmup ratio instead
    packing=True,
    padding_free = True,
    sequential_sampling = False,
    reverse_ffd_packing= False,
    remove_unused_columns=False,
)


# === Run LIMA Fine-Tuning ===
log.info("\n--- Starting LIMA Fine-Tuning ---")
# The model object will be updated with the fine-tuned weights
trainer = llm_training.sft_train_on_dataset(
    model=model,
    tokenizer=tokenizer,
    log=log,
    train_dataset=medex_ds,
    train_cfg=lima_training_config,
    train=False,
    use_liger_loss = True
)

2025-07-08 13:01:38 - INFO - [__main__] - 
--- Starting LIMA Fine-Tuning ---
2025-07-08 13:01:38 - INFO - [__main__] - Starting SFT training run...


False


2025-07-08 13:01:38 - INFO - [liger_kernel.transformers.monkey_patch] - Applying Liger kernels to model instance with model type: qwen2 with kwargs: {}


Applied Liger kernels to Qwen2


In [None]:
print(len(trainer.get_train_dataloader()))

4883


In [None]:
import wandb
trainer.train()
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mjiosephlee[0m ([33mupenn-ml[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,2.4915
20,2.4663
30,2.4792
40,2.5135
50,2.4424
60,2.5026
70,2.4348
80,2.4153
90,2.347
100,2.3396


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
train/epoch,▁▁▁▁▁▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇██
train/global_step,▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▄▄▄▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇███
train/grad_norm,█▁▂▁▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▁▁▂▂▃▃▃▄▄▄▅▅▆▆▇▇▇███████▇▇▇▆▅▅▄▄▄▃▃▂▂▂▁▁
train/loss,█▅▄▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▂▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/num_tokens,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇██████

0,1
total_flos,1.9027773736143744e+17
train/epoch,1.0
train/global_step,4883.0
train/grad_norm,3.17188
train/learning_rate,0.0
train/loss,0.9105
train/num_tokens,88608895.0
train_loss,1.05668
train_runtime,5259.5589
train_samples_per_second,16.711


In [10]:
# git config --global user.email "jiosephlee@gmail.com"
# git config --global user.name "Joseph Lee"
inference_config = llm_configs.InferenceConfig(no_repeat_ngram_size=6, max_new_tokens=1024)
question = f"""Aluminum is"""
generated_text = llm_training.generate_text(model, tokenizer, question, inference_config)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Aluminum is a metal that can be used to create various products, including aluminum alloy. Aluminum alloy is a type of metal alloy that is commonly used in construction and manufacturing industries. It is a component of many products such as aluminum alloy doors, aluminum alloy windows, and aluminum alloy appliances. Aluminum alloy is also used in the production of aluminum foil, which is used in various applications such as inductors, capacitors, and in the manufacture of electronic devices. In addition, aluminum alloy is used in the production of various types of aircraft, including jet aircraft, which are used for long-distance travel and high-speed flight. Overall, aluminum alloy is a versatile and widely used metal alloy that has numerous applications in various industries.<|endoftext|>


In [11]:
# Save model before we LIMA tune
model.push_to_hub('jiosephlee/therapeutic_fine_tuning_750K')
tokenizer.push_to_hub('jiosephlee/therapeutic_fine_tuning_750K')

model.safetensors: 100%|██████████| 988M/988M [01:51<00:00, 8.84MB/s] 
tokenizer.json: 100%|██████████| 11.4M/11.4M [00:00<00:00, 13.2MB/s]


CommitInfo(commit_url='https://huggingface.co/jiosephlee/therapeutic_fine_tuning_750K/commit/2f3ad523a8622d50e9a2254e543702e4a4c4930c', commit_message='Upload tokenizer', commit_description='', oid='2f3ad523a8622d50e9a2254e543702e4a4c4930c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jiosephlee/therapeutic_fine_tuning_750K', endpoint='https://huggingface.co', repo_type='model', repo_id='jiosephlee/therapeutic_fine_tuning_750K'), pr_revision=None, pr_num=None)

In [2]:
# ── 3. Reload whenever you need them ───────────────────────────────────────
import pandas as pd
from pathlib import Path
# ── 1. Resolve the destination ──────────────────────────────────────────────
home_dir   = Path("~").expanduser()          # → /home/your-user  (Linux/Mac) or C:\Users\your-user (Windows)
data_dir   = home_dir / "ml_splits"          # keep everything in one tidy folder
data_dir.mkdir(exist_ok=True)                # create it if it doesn’t exist

train_path = data_dir / "train.parquet"
test_path  = data_dir / "test.parquet"
train_df = pd.read_parquet(train_path)
test_df  = pd.read_parquet(test_path)

print("Train shape:", train_df.shape)
print("Test  shape:", test_df.shape)

Train shape: (18041, 5)
Test  shape: (5154, 5)


In [3]:
import pandas as pd
from datasets import Dataset

# ---- 1.  Your starting DataFrame (df) ----
# df = pd.read_csv(...)   # or however you loaded it

# ---- 2.  Build the concatenated text for every row ----
def row_to_text(row):
    return (
        f"[Drug SMILE] {row['Drug']} "
        f"[Target] {row['Target_ID']} "
        f"[Binding Affinity] {row['Y']:}"
    )

train_df["text"] = train_df.apply(row_to_text, axis=1)

# ---- 3.  Keep only the 'text' column and convert to a Dataset ----
training_ds = Dataset.from_pandas(train_df[["text"]], preserve_index=False)

print(training_ds[0]["text"])
# '[Drug SMILE] Cc1[nH]nc2ccc(-c3cncc(OCC(N)Cc4ccccc4)c3)cc12 [Target] AAK1 [Binding Affinity] 7.37'

[Drug SMILE] Cc1[nH]nc2ccc(-c3cncc(OCC(N)Cc4ccccc4)c3)cc12 [Target] AAK1 [Binding Affinity] 7.3655227298392685


In [27]:
lima_training_config = llm_configs.TrainingConfig(
    run_name = "finetuning on TDC DAVIS",
    num_train_epochs = 1,
    learning_rate  = 2e-5,
    logging_strategy = "steps", 
    logging_steps = 10,
    gradient_checkpointing=False,
    context_length = 1024,
    use_liger_kernel=True,
    per_device_train_batch_size = 16,
    gradient_accumulation_steps=1,
    # warmup_steps  = 0, # LIMA specifies no warmup, so we set this explicitly
    warmup_ratio = 0.3, # Use our default warmup ratio instead
    packing=True,
    padding_free = True,
    sequential_sampling = False,
    reverse_ffd_packing= False,
    remove_unused_columns=False,
)


# === Run LIMA Fine-Tuning ===
log.info("\n--- Starting LIMA Fine-Tuning ---")
# The model object will be updated with the fine-tuned weights
trainer = llm_training.sft_train_on_dataset(
    model=model,
    tokenizer=tokenizer,
    log=log,
    train_dataset=training_ds,
    train_cfg=lima_training_config,
    train=True,
    use_liger_loss = True
)

2025-07-08 15:22:38 - INFO - [__main__] - 
--- Starting LIMA Fine-Tuning ---
2025-07-08 15:22:38 - INFO - [__main__] - Starting SFT training run...


False


Adding EOS to train dataset: 100%|██████████| 18041/18041 [00:00<00:00, 75456.35 examples/s]
Tokenizing train dataset: 100%|██████████| 18041/18041 [00:02<00:00, 7937.24 examples/s]
Packing train dataset: 100%|██████████| 18041/18041 [00:00<00:00, 132959.14 examples/s]
2025-07-08 15:22:40 - INFO - [liger_kernel.transformers.monkey_patch] - Applying Liger kernels to model instance with model type: qwen2 with kwargs: {}


Applied Liger kernels to Qwen2


Step,Training Loss
10,2.9552
20,1.719
30,0.8839
40,0.4993
50,0.4076
60,0.3659
70,0.352
80,0.3462


2025-07-08 15:24:00 - INFO - [__main__] - SFT training complete.
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
train/epoch,▁▂▃▄▅▆▇██
train/global_step,▁▂▃▄▅▆▇██
train/grad_norm,█▄▃▃▂▁▁▁
train/learning_rate,▃▆█▇▅▄▂▁
train/loss,█▅▂▁▁▁▁▁
train/num_tokens,▁▂▃▄▅▆▇██

0,1
total_flos,2802151099368192.0
train/epoch,1.0
train/global_step,83.0
train/grad_norm,6.03125
train/learning_rate,0.0
train/loss,0.3462
train/num_tokens,1304911.0
train_loss,0.91965
train_runtime,79.4311
train_samples_per_second,16.58


In [4]:
import pandas as pd
from datasets import Dataset

# ---- 1.  Your starting DataFrame (df) ----
# df = pd.read_csv(...)   # or however you loaded it

# ---- 2.  Build the concatenated text for every row ----
def row_to_text(row):
    return (
        f"[Drug SMILE] {row['Drug']} "
        f"[Target] {row['Target_ID']} "
        f"[Binding Affinity] {row['Y']:}"
    )
test_df["text"] = test_df.apply(row_to_text, axis=1)

# ---- 3.  Keep only the 'text' column and convert to a Dataset ----
test_ds = Dataset.from_pandas(test_df[["text"]], preserve_index=False)

print(test_ds[0]["text"])
# '[Drug SMILE] Cc1[nH]nc2ccc(-c3cncc(OCC(N)Cc4ccccc4)c3)cc12 [Target] AAK1 [Binding Affinity] 7.37'

[Drug SMILE] CC(O)C(=O)O.CN1CCN(c2ccc3c(c2)NC(=C2C(=O)N=c4cccc(F)c4=C2N)N3)CC1.O [Target] QSK [Binding Affinity] 4.999995657076895


In [29]:
import re
from tqdm import tqdm
import numpy as np

inference_cfg = llm_configs.InferenceConfig(
    temperature=0,
    repetition_penalty=1,
    max_new_tokens=32,   # 32 is plenty for a single number
)

# regular expressions
row_pat   = re.compile(
    r"\[Drug SMILE]\s+(.*?)\s+\[Target]\s+(.*?)\s+\[Binding Affinity]\s+([-+]?\d*\.?\d+)"
)
num_pat   = re.compile(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?")    # catch first float in the generation

targets, preds = [], []

for row in tqdm(test_ds["text"], desc="Inference on test set"):
    m = row_pat.match(row)
    if m is None:
        # skip badly-formatted rows
        continue

    drug_smiles, target_id, gt_aff_str = m.groups()
    gt_aff = float(gt_aff_str)

    prompt = f"[Drug SMILE] {drug_smiles} [Target] {target_id} [Binding Affinity] "

    gen_text = llm_training.generate_text(model, tokenizer, prompt, inference_cfg)

    num_match = num_pat.search(gen_text)
    if num_match is None:
        # model didn’t output a float we can parse → skip
        continue

    pred_aff = float(num_match.group())

    targets.append(gt_aff)
    preds.append(pred_aff)

# ------------------
# 2. compute MSE
# ------------------
targets = np.array(targets, dtype=np.float32)
preds   = np.array(preds,   dtype=np.float32)

mse = np.mean((preds - targets) ** 2)
print(f"\nMSE on {len(targets)} examples: {mse:.4f}")

Inference on test set:   0%|          | 0/5154 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Inference on test set:   0%|          | 1/5154 [00:00<22:17,  3.85it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Inference on test set:   0%|          | 2/5154 [00:00<21:55,  3.92it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Inference on test set:   0%|          | 3/5154 [00:00<21:48,  3.94it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Inference on test set:   0%|          | 4/5154 [00:01<21:45,  3.95it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Inference on test set:   0%|          | 5/5154 [00:01<21:44,  3.95it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Inference on test set:   0%|          | 6/5154 [00:01<21:43,  3.95it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



MSE on 5154 examples: 19.8714





In [5]:
# === Cell 1: Configuration ===
model_config = llm_configs.ModelConfig(
    id="Qwen/Qwen2.5-0.5B",
    peft=llm_configs.PeftConfig(
        enabled=False,
        add_eot_token=False,  # No longer doing EOT token for LIMA
    ),
    quantization=llm_configs.QuantizationConfig(mode=None), # Use QLoRA
)

log.info("--- Configuration ---")
print(model_config.model_dump_json(indent=2))

log.info("\n--- Loading Model for Training ---")
model, tokenizer = llm_training.load_model_for_training(model_config, log)

2025-07-08 16:06:11 - INFO - [__main__] - --- Configuration ---
2025-07-08 16:06:11 - INFO - [__main__] - 
--- Loading Model for Training ---
2025-07-08 16:06:11 - INFO - [__main__] - Loading model 'Qwen/Qwen2.5-0.5B' for training...


{
  "id": "Qwen/Qwen2.5-0.5B",
  "torch_dtype": "auto",
  "attn_implementation": "flash_attention_2",
  "peft": {
    "enabled": false,
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj",
      "gate_proj",
      "up_proj",
      "down_proj"
    ],
    "add_eot_token": false
  },
  "quantization": {
    "mode": null
  }
}


2025-07-08 16:06:12 - INFO - [__main__] - Model and tokenizer loaded successfully.


In [None]:
lima_training_config = llm_configs.TrainingConfig(
    run_name = "finetuning raw qwen on TDC DAVIS",
    num_train_epochs = 1,
    learning_rate  = 4e-5,
    logging_strategy = "steps", 
    logging_steps = 10,
    gradient_checkpointing=False,
    context_length = 1024,
    use_liger_kernel=True,
    per_device_train_batch_size = 16,
    gradient_accumulation_steps=1,
    # warmup_steps  = 0, # LIMA specifies no warmup, so we set this explicitly
    warmup_ratio = 0.3, # Use our default warmup ratio instead
    packing=True,
    padding_free = True,
    sequential_sampling = False,
    reverse_ffd_packing= False,
    remove_unused_columns=False,
)


# === Run LIMA Fine-Tuning ===
log.info("\n--- Starting LIMA Fine-Tuning ---")
# The model object will be updated with the fine-tuned weights
trainer = llm_training.sft_train_on_dataset(
    model=model,
    tokenizer=tokenizer,
    log=log,
    train_dataset=training_ds,
    train_cfg=lima_training_config,
    train=True,
    use_liger_loss = True
)

2025-07-08 16:06:16 - INFO - [__main__] - 
--- Starting LIMA Fine-Tuning ---
2025-07-08 16:06:16 - INFO - [__main__] - Starting SFT training run...


False


Adding EOS to train dataset:   0%|          | 0/18041 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/18041 [00:00<?, ? examples/s]

In [None]:
import re
from tqdm import tqdm
import numpy as np

inference_cfg = llm_configs.InferenceConfig(
    temperature=0,
    repetition_penalty=1,
    max_new_tokens=32,   # 32 is plenty for a single number
)

# regular expressions
row_pat   = re.compile(
    r"\[Drug SMILE]\s+(.*?)\s+\[Target]\s+(.*?)\s+\[Binding Affinity]\s+([-+]?\d*\.?\d+)"
)
num_pat   = re.compile(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?")    # catch first float in the generation

targets, preds = [], []

for row in tqdm(test_ds["text"], desc="Inference on test set"):
    m = row_pat.match(row)
    if m is None:
        # skip badly-formatted rows
        continue

    drug_smiles, target_id, gt_aff_str = m.groups()
    gt_aff = float(gt_aff_str)

    prompt = f"[Drug SMILE] {drug_smiles} [Target] {target_id} [Binding Affinity] "

    gen_text = llm_training.generate_text(model, tokenizer, prompt, inference_cfg)

    num_match = num_pat.search(gen_text)
    if num_match is None:
        # model didn’t output a float we can parse → skip
        continue

    pred_aff = float(num_match.group())

    targets.append(gt_aff)
    preds.append(pred_aff)

# ------------------
# 2. compute MSE
# ------------------
targets = np.array(targets, dtype=np.float32)
preds   = np.array(preds,   dtype=np.float32)

mse = np.mean((preds - targets) ** 2)
print(f"\nMSE on {len(targets)} examples: {mse:.4f}")