# Imports

In [None]:
!pip install absl-py rouge-score nltk

In [None]:
!python -m nltk.downloader punkt

In [1]:
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling
)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig
import torch
import wandb
import evaluate  # Hugging Face's evaluate library
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


# Configs

In [2]:
model_path = "../models/phi_pubmed_pretrained_attempt_3/final"

data_path = "../data/pubmed_baseline/"
test_data_path = data_path + "pubmed_test.csv"

model_id = "microsoft/Phi-3.5-mini-instruct"

max_len = 300
batch_size = 8

In [3]:
wandb.init(project="pubmed-pretrain-evaluation", name="base_model")

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhasindumadushan325[0m ([33mhasindumadushan325-university-of-peradeniya[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Dataset

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

In [5]:
def tokenize_dataset(tokenizer, data_df):
    dataset = Dataset.from_pandas(data_df)
    def tokenize(example):
        text = f"<s>{example['title']}\n{example['abstract']}</s>"
        return tokenizer(text, truncation=True, padding="max_length", max_length=max_len, return_attention_mask=True)
    dataset = dataset.map(tokenize, batched=False)
    return dataset

In [6]:
test_df = pd.read_csv(test_data_path)

test_set = tokenize_dataset(tokenizer, test_df.iloc[:2000, :])

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:01<00:00, 1337.51 examples/s]


# Model

In [7]:
# === Quantized model loading ===
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.84it/s]


In [8]:
model = PeftModel.from_pretrained(model, model_path)
model.eval() 

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3LongRoPEScaledRotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, out

### Base model

In [9]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    trust_remote_code=True
)
model.eval()

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.85it/s]


Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3LongRoPEScaledRotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, out

In [None]:
# training_args = TrainingArguments(
#     output_dir="./eval_output_base",
#     per_device_eval_batch_size=batch_size,
#     do_eval=True,
#     report_to="none"
# )

# base_model_trainer = Trainer(
#     model=base_model,
#     args=training_args,
#     tokenizer=tokenizer,
#     data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
# )

# base_model_eval_result = base_model_trainer.evaluate(test_set)

# Test set evaluation

In [9]:
training_args = TrainingArguments(
    output_dir="./eval_output",
    per_device_eval_batch_size=batch_size,
    do_eval=True,
    report_to="none",
    eval_accumulation_steps=2,
)

In [11]:


# Load metrics
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """ Calculate bleu and rouge scores """
    with torch.no_grad():
        logits, labels = eval_preds
        
        logits = logits.cpu().numpy() if torch.is_tensor(logits) else logits
        
        labels = labels.cpu().numpy() if torch.is_tensor(labels) else labels
        
        # Get predicted token IDs (argmax of logits)
        pred_ids = np.argmax(logits, axis=-1)  # Shape: (batch_size, seq_length)
        
        # Decode predictions and labels
        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        
        # Replace -100 with pad_token_id in labels
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)

        references = [[ref] for ref in label_str]

        
        # Compute BLEU
        bleu_result = bleu_metric.compute(
            predictions=pred_str,
            references=references
        )
        
        # Compute ROUGE
        rouge_result = rouge_metric.compute(
            predictions=pred_str,
            references=label_str,
            use_stemmer=True
        )
        
        # Extract main scores
        metrics = {
            'bleu': bleu_result['bleu'],
            'rouge1': rouge_result['rouge1'],
            'rouge2': rouge_result['rouge2'],
            'rougeL': rouge_result['rougeL'],
        }
        torch.cuda.empty_cache()
        
        return metrics


trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    compute_metrics=compute_metrics
)

# Then when you evaluate




  trainer = Trainer(


In [12]:
# === Evaluate perplexity ===
eval_result = trainer.evaluate(test_set)

You are not running the flash-attention implementation, expect numerical differences.


In [13]:
# Print all results
print("\nEvaluation Metrics:")
print(f"Loss: {eval_result['eval_loss']:.4f}")
print(f"Perplexity: {torch.exp(torch.tensor(eval_result['eval_loss'])):.2f}")
print(f"BLEU: {eval_result['eval_bleu']:.4f}")
print(f"ROUGE-1: {eval_result['eval_rouge1']:.4f}")
print(f"ROUGE-2: {eval_result['eval_rouge2']:.4f}")
print(f"ROUGE-L: {eval_result['eval_rougeL']:.4f}")


Evaluation Metrics:
Loss: 1.8662
Perplexity: 6.46
BLEU: 0.1632
ROUGE-1: 0.5243
ROUGE-2: 0.2098
ROUGE-L: 0.4334


In [14]:
wandb.log({
    "eval_loss": eval_result['eval_loss'], 
    "perplexity": torch.exp(torch.tensor(eval_result['eval_loss'])),
    "BLUE": eval_result['eval_bleu'],
    "ROUGE_1": eval_result['eval_rouge1'],
    "ROUGE_2": eval_result['eval_rouge2'],
    "ROUGE_L": eval_result['eval_rougeL']    
})

# Inference

In [21]:
samples = test_set.select(range(433, 439))  # First 5 examples
input_ids = torch.tensor(samples["input_ids"]).to(model.device)
attention_mask = torch.tensor(samples["attention_mask"]).to(model.device)

generated_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=128,
    do_sample=False,
    use_cache=False
)

generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

# Log predictions to W&B
wandb_table = wandb.Table(columns=["Title", "Actual Abstract", "Generated Text"])
for i, gen in enumerate(generated_texts):
    title = samples[i]["title"]
    actual = samples[i]["abstract"]
    print(f"\nActual: {title}\n{actual}\n---\nGenerated: {gen}\n")
    wandb_table.add_data(title, actual, gen)





Actual: The elastase activity of alveolar macrophages: measurements using synthetic substrates and elastin.
Hamster, rat, guinea pig, and rabbit alveolar macrophage extracts were tested for elastase activity using elastin suspended in agar and two synthetic substrates, p-nitrophenyl N-tert-butyloxycarbonyl L-alaninate (NBA) and succinyl-L-alanyl-L-alanyl-L-alanine-p-nitroanilide (SLAPN). Activity against NBA was easily detectable, but there was no activity against SLAPN or against elastin-agar, although the assay procedures employing these substrates measured as little as 50 and 2 ng of pancreatic elastase, respectively. We concluded that unstimulated alveolar macrophages from these animals do not contain elastase, and that NBA activity is misleading as an indicator of elastolytic activity in crude alveolar macrophage extracts from these species.
---
Generated: The elastase activity of alveolar macrophages: measurements using synthetic substrates and elastin.
Hamster, rat, guinea pig,

In [23]:
samples[0]

{'pmid': 1247212,
 'title': 'The elastase activity of alveolar macrophages: measurements using synthetic substrates and elastin.',
 'abstract': 'Hamster, rat, guinea pig, and rabbit alveolar macrophage extracts were tested for elastase activity using elastin suspended in agar and two synthetic substrates, p-nitrophenyl N-tert-butyloxycarbonyl L-alaninate (NBA) and succinyl-L-alanyl-L-alanyl-L-alanine-p-nitroanilide (SLAPN). Activity against NBA was easily detectable, but there was no activity against SLAPN or against elastin-agar, although the assay procedures employing these substrates measured as little as 50 and 2 ng of pancreatic elastase, respectively. We concluded that unstimulated alveolar macrophages from these animals do not contain elastase, and that NBA activity is misleading as an indicator of elastolytic activity in crude alveolar macrophage extracts from these species.',
 'input_ids': [32000,
  32000,
  32000,
  32000,
  32000,
  32000,
  32000,
  32000,
  32000,
  32000,

In [22]:
wandb.log({"generated_examples": wandb_table})

In [18]:
def generate(model, text, max_new_tokens=128):
    sample = tokenizer(text, truncation=True, padding="max_length", max_length=max_len, return_attention_mask=True)
    input_ids = torch.tensor([sample["input_ids"]]).to(model.device)
    attention_mask = torch.tensor([sample["attention_mask"]]).to(model.device)
    
    generated_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        use_cache=False
    )
    
    generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_texts[0]
    # # Log predictions to W&B
    # for i, gen in enumerate(generated_texts):
    #     title = samples[i]["title"]
    #     actual = samples[i]["abstract"]
    #     print(f"\nTitle: {title}\n---\nActual Abstract: {actual}\n---\nGenerated: {gen}\n")
    #     wandb_table.add_data(title, actual, gen)
    
    
    # wandb.log({"generated_examples": wandb_table})

In [19]:
generated_text = generate(model, "The relationship between diabetes and blood pressure\n")
generated_text

"The relationship between diabetes and blood pressure\n\nDiabetes and high blood pressure often occur together, and managing both conditions is crucial for reducing the risk of complications. Here's how they are related:\n\n1. **Insulin Resistance**: Insulin resistance, a hallmark of type 2 diabetes, can lead to increased blood pressure. Insulin resistance can cause the body to retain sodium, which increases blood volume and, consequently, blood pressure.\n\n2. **Kidney Damage**: Diabetes can damage the blood vessels in the kidneys, impairing their ability"

In [20]:
wandb.log({"example_1": generated_text})

In [None]:
generate(base_model, "# The relationship between diabetes and blood pressure\n")