# Imports

In [30]:
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling
)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig
import torch
import wandb

# Configs

In [31]:
model_path = "../models/phi_pubmed_pretrained_attempt_1/final"

data_path = "../data/pubmed_baseline/"
test_data_path = data_path + "pubmed_test.csv"

max_len = 300
batch_size = 32

In [32]:
wandb.init(project="pubmed-pretrain-evaluation", name="attempt_2")

0,1
eval_loss,▁
eval_perplexity,▁

0,1
eval_loss,1.63262
eval_perplexity,5.11726


# Dataset

In [33]:
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

In [34]:
def tokenize_dataset(tokenizer, data_df):
    dataset = Dataset.from_pandas(data_df)
    def tokenize(example):
        text = f"<s>#{example['title']}\n{example['abstract']}</s>"
        return tokenizer(text, truncation=True, padding="max_length", max_length=max_len, return_attention_mask=True)
    dataset = dataset.map(tokenize, batched=False)
    return dataset

In [35]:
test_df = pd.read_csv(test_data_path)

test_set = tokenize_dataset(tokenizer, test_df)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [00:02<00:00, 1279.84 examples/s]


# Model

In [36]:
peft_config = PeftConfig.from_pretrained(model_path)
base_model_name = peft_config.base_model_name_or_path

In [37]:
base_model_name

'microsoft/Phi-3.5-mini-instruct'

In [38]:
# === Quantized model loading ===
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.23s/it]


In [39]:
model = PeftModel.from_pretrained(model, model_path)
model.eval() 

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3Attention(
              (o_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitud

# Test set evaluation

In [40]:
training_args = TrainingArguments(
    output_dir="./eval_output",
    per_device_eval_batch_size=batch_size,
    do_eval=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [41]:
# === Evaluate perplexity ===
eval_result = trainer.evaluate(test_set)

In [42]:
loss = eval_result["eval_loss"]
perplexity = torch.exp(torch.tensor(loss))

print(f"\n✅ Evaluation Metrics:")
print(f"Eval Loss     : {loss:.4f}")
print(f"Eval Perplexity: {perplexity:.2f}")


✅ Evaluation Metrics:
Eval Loss     : 1.6326
Eval Perplexity: 5.12


In [43]:
wandb.log({"eval_loss": loss, "eval_perplexity": perplexity.item()})

# Inference

In [28]:
!pip show transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Name: transformers
Version: 4.51.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /home/super_admin/hasindu/myenv/lib/python3.10/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft


In [29]:
pip install transformers==4.40

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting transformers==4.40
  Downloading transformers-4.40.0-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tokenizers<0.20,>=0.19
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.1
    Uninstalling tokenizers-0.21.1:
      Successfully uninstalled tokenizers-0.21.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.3
    Uninstalling transformers-4.51.3:
      Successfully uninstalled transformers-4.51.3
Successfully installed tokenizers-0.19.1 transformers-4.40.0
Note: you may need to restart the kernel to

In [45]:
samples = test_set.select(range(5))  # First 5 examples
input_ids = torch.tensor(samples["input_ids"]).to(model.device)
attention_mask = torch.tensor(samples["attention_mask"]).to(model.device)

generated_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=128,
    do_sample=False,
    use_cache=False
)

generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

# Log predictions to W&B
wandb_table = wandb.Table(columns=["Title", "Actual Abstract", "Generated Text"])
for i, gen in enumerate(generated_texts):
    title = samples[i]["title"]
    actual = samples[i]["abstract"]
    print(f"\nTitle: {title}\n---\nActual Abstract: {actual}\n---\nGenerated: {gen}\n")
    wandb_table.add_data(title, actual, gen)


wandb.log({"generated_examples": wandb_table})


Title: Doxorubicin-vincristine therapy for Wilms' tumor: a pilot study.
---
Actual Abstract: Doxorubicin plus vincristine chemotherapy was given to 31 children following nephrectomy for Wilms' tumor. Radiation therapy was used as indicated. Disease-free survival by stage is: eight of nine patients (stage I), eight of nine (stage II), nine of ten (stage III), and two of three (stage IV). Median follow-up of survivors is 28 months (range, 2-67); for all but four patients, follow-up is greater than 12 months. Two of the three stage I-III failures occurred in children with unfavorable histologies; the third failure was due to fatal anthracycline cardiomyopathy. Lowering the maximal cumulative doxorubicin dose from 450 to 240 mg/m2 did not increase failures. Doxorubicin-vincristine appears to be effective chemotherapy for Wilms' tumor.
---
Generated: #Doxorubicin-vincristine therapy for Wilms' tumor: a pilot study.
Doxorubicin plus vincristine chemotherapy was given to 31 children followin

NameError: name 'use_wandb' is not defined

In [46]:
wandb.log({"generated_examples": wandb_table})

## Base model comparison

In [49]:
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    trust_remote_code=True
)
base_model.eval()

Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.81it/s]


Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3LongRoPEScaledRotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, out

In [50]:
training_args = TrainingArguments(
    output_dir="./eval_output_base",
    per_device_eval_batch_size=batch_size,
    do_eval=True,
    report_to="none"
)

base_model_trainer = Trainer(
    model=base_model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

base_model_eval_result = base_model_trainer.evaluate(test_set)

  base_model_trainer = Trainer(


In [51]:
loss = base_model_eval_result["eval_loss"]
perplexity = torch.exp(torch.tensor(loss))

print(f"\n✅ Evaluation Metrics:")
print(f"Eval Loss     : {loss:.4f}")
print(f"Eval Perplexity: {perplexity:.2f}")


✅ Evaluation Metrics:
Eval Loss     : 1.8892
Eval Perplexity: 6.61


In [52]:
wandb.log({"base_model_eval_loss": loss, "base_model_eval_perplexity": perplexity.item()})

In [60]:
def generate(model, text, max_new_tokens=128):
    sample = tokenizer(text, truncation=True, padding="max_length", max_length=max_len, return_attention_mask=True)
    input_ids = torch.tensor([sample["input_ids"]]).to(model.device)
    attention_mask = torch.tensor([sample["attention_mask"]]).to(model.device)
    
    generated_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        use_cache=False
    )
    
    generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_texts[0]
    # # Log predictions to W&B
    # for i, gen in enumerate(generated_texts):
    #     title = samples[i]["title"]
    #     actual = samples[i]["abstract"]
    #     print(f"\nTitle: {title}\n---\nActual Abstract: {actual}\n---\nGenerated: {gen}\n")
    #     wandb_table.add_data(title, actual, gen)
    
    
    # wandb.log({"generated_examples": wandb_table})

In [61]:
generate(model, "# The relationship between diabetes and blood pressure\n")

'# The relationship between diabetes and blood pressure\nDiabetes and hypertension are two of the most common chronic diseases in the world. The prevalence of diabetes is increasing rapidly, and the prevalence of hypertension is also increasing. The two diseases are closely related. The prevalence of hypertension is higher in patients with diabetes than in the general population. The prevalence of diabetes is higher in patients with hypertension than in the general population. The prevalence of diabetes is higher in patients with hypertension than in the general population. The prevalence of diabetes'

In [62]:
generate(base_model, "# The relationship between diabetes and blood pressure\n")

"# The relationship between diabetes and blood pressure\n\n# Importing the libraries\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport pandas as pd\n\n# Importing the dataset\ndataset = pd.read_csv('diabetes.csv')\nX = dataset.iloc[:, :-1].values\ny = dataset.iloc[:, -1].values\n\n# Splitting the dataset into the Training set and Test set\nfrom sklearn.model_selection import train_test_split\nX_train, X_test, y_train, y_test = train_test_split(X, y,"