# Imports

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling
)
from peft import PeftModel, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig
import torch
import wandb

from utils import tokenize_dataset_for_qna

  from .autonotebook import tqdm as notebook_tqdm


# Configs

In [2]:
data_path = "../data/qna/"
train_data_path = data_path + "train.csv"
val_data_path = data_path + "val.csv"

max_len = 512

base_model_path = "../models/phi_pubmed_pretrained_attempt_3/final_pretrained"

model_id = "microsoft/Phi-3.5-mini-instruct"

model_output_dir = "../models/phi_qna_finetuned_attempt_1"

# Hyperparameters

In [3]:
lora_r = 16
lora_alpha = 32
lora_target_modules = ["q_proj", "v_proj", "o_proj"]
batch_size = 32
quantization = None
lora_dropout = 0.05
epochs = 5
learning_rate = 5e-5

# Dataset

In [4]:
prompt_template = """
# Instruction:
Assume you are an excellent doctor. Using your knowledge, answer the quesion given below.

# Question: {question}

# Answer: """
prompt_template = prompt_template.strip()
print(prompt_template)

# Instruction:
Assume you are an excellent doctor. Using your knowledge, answer the quesion given below.

# Question: {question}

# Answer:


In [8]:
def tokenize(example):
    prompt = prompt_template.format(question=example['question'])
    answer = example["answer"] + tokenizer.eos_token
    full_text = prompt + answer
    
    # Tokenize prompt to get its length
    prompt_tokens = tokenizer(
        prompt,
        truncation=False
    )
    
    prompt_len = len(prompt_tokens["input_ids"])

    # Tokenize full sequence once to get the total token count
    tokenized = tokenizer(
        full_text,
        truncation=True,
        max_length=max_len
    )
    
    # Convert to numpy arrays for faster operations
    full_text_len = len(tokenized["input_ids"])
    
    # Tokenize full sequence once
    tokenized = tokenizer(
        full_text,
        truncation=True,
        max_length=max_len,
        padding="max_length",
        return_attention_mask=True    
    )
    
    # Convert to numpy arrays for faster operations
    input_ids = np.array(tokenized["input_ids"])
    attention_mask = np.array(tokenized["attention_mask"])
    
    # Create labels array and mask prompt portion efficiently
    labels = input_ids.copy()
    # Mask the prompt tokens
    padding_len = max_len - full_text_len
    labels[padding_len:padding_len + prompt_len] = -100
    # print(full_text_len, padding_len, prompt_len)
    
    # Update the tokenized dict with numpy arrays
    tokenized["input_ids"] = input_ids.tolist()
    tokenized["attention_mask"] = attention_mask.tolist()
    tokenized["labels"] = labels.tolist()
    
    return tokenized

def tokenize_dataset(tokenizer, data_df):
    dataset = Dataset.from_pandas(data_df)
    dataset = dataset.map(tokenize, batched=False)
    return dataset

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

In [None]:
tokenizer([tokenizer.pad_token])

In [7]:
val_df = pd.read_csv(val_data_path)
train_df = pd.read_csv(train_data_path)

val_set = tokenize_dataset_for_qna(tokenizer, val_df, prompt_template, max_len)
train_set = tokenize_dataset_for_qna(tokenizer, train_df, prompt_template, max_len)

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:00<00:00, 608.28 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110000/110000 [03:11<00:00, 573.41 examples/s]


In [8]:
wandb.init(
    project="med-qna-finetune",
    name="attempt_1",
    config={
        "model": model_id,
        "lora_r": lora_r,
        "lora_alpha": lora_alpha,
        "batch_size": batch_size,
        "epochs": epochs,
        "quantization": quantization,
        "lora_target_modules": lora_target_modules
    }
)

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhasindumadushan325[0m ([33mhasindumadushan325-university-of-peradeniya[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=quantization=="4bit",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     # quantization_config=bnb_config,
#     device_map="auto",
#     trust_remote_code=True
# )

# model = PeftModel.from_pretrained(base_model, "../models/phi_pubmed_pretrained_attempt_3/final")
# model = model.merge_and_unload()

In [None]:
# model.save_pretrained("../models/phi_pubmed_pretrained_attempt_3/final_pretrained")

In [9]:
model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    # quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.97it/s]


In [10]:
model.gradient_checkpointing_enable()
# model = prepare_model_for_kbit_training(model)

In [11]:
lora_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=lora_target_modules,
    lora_dropout=lora_dropout,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)

# Train

In [12]:
training_args = TrainingArguments(
    output_dir=model_output_dir,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    eval_strategy="epoch",  # ✅ eval at each epoch
    save_strategy="epoch",
    logging_steps=50,
    learning_rate=learning_rate,
    fp16=True,
    report_to="wandb",
    run_name="attempt_1",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()

trainer.save_model(model_output_dir + "/final")
tokenizer.save_pretrained(model_output_dir + "/final")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
You are not running the flash-attention implementation, expect numerical differences.


Epoch,Training Loss,Validation Loss


## Merge model with lora weights

In [None]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

finetuned_model = PeftModel.from_pretrained(base_model, output_model_dir + "/final")
merged_model = finetuned_model.merge_and_unload()

In [None]:
model.save_pretrained(output_model_dir +  "/final_pretrained")