In [None]:
!pip install absl-py rouge-score nltk

In [None]:
!python -m nltk.downloader punkt

# Imports

In [2]:
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling
)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig
import torch
import wandb
import numpy as np
import torch

from evaluation_metrics import compute_metrics_for_pretrain
from utils import tokenize_pretrain_dataset

  from .autonotebook import tqdm as notebook_tqdm


# Configs

In [3]:
# Hugging Face id of the base model; the tokenizer and base weights load from it.
model_id = "microsoft/Phi-3.5-mini-instruct"

# Checkpoint directory that is passed to PeftModel.from_pretrained.
model_path = "../models/phi_pubmed_pretrained_attempt_5/checkpoint-6251"

# Test CSV location. data_path already ends with "/", so plain interpolation
# produces the full file path.
data_path = "../data/pubmed_baseline/"
test_data_path = f"{data_path}pubmed_test.csv"

# max_len is forwarded to tokenize_pretrain_dataset;
# batch_size is used as the per-device evaluation batch size.
max_len, batch_size = 360, 8

# Dataset

In [4]:
# Load the tokenizer published under the base model id.
# trust_remote_code=True permits code shipped in the model repo to run.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
)

In [14]:
# Read the test CSV and tokenize only its first 2000 rows (all columns kept).
test_df = pd.read_csv(test_data_path)
test_subset_df = test_df.iloc[:2000]
test_set = tokenize_pretrain_dataset(tokenizer, test_subset_df, max_len)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:01<00:00, 1425.84 examples/s]


In [None]:
# Display one tokenized example (index 3) as a quick sanity check.
test_set[3]

In [None]:
# Decode the first tokenized example back to text; special tokens are kept
# (skip_special_tokens=False) so they are visible in the output.
first_example_ids = test_set[0]["input_ids"]
tokenizer.decode(first_example_ids, skip_special_tokens=False)

# Model

In [5]:
# === Base model loading (4-bit quantization currently disabled) ===
# 4-bit NF4 settings with double quantization and bfloat16 compute.
# NOTE: this config is only constructed here. It is NOT applied, because the
# quantization_config argument below is commented out.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

base_model_kwargs = dict(
    # quantization_config=bnb_config,  # uncomment to load the model in 4-bit
    device_map="auto",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(model_id, **base_model_kwargs)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.82it/s]


In [6]:
# Attach the adapter checkpoint stored at model_path to the loaded base model.
# The isinstance guard keeps this cell idempotent: without it, re-running the
# cell would pass an already-wrapped PeftModel back into from_pretrained and
# wrap it a second time.
if not isinstance(model, PeftModel):
    model = PeftModel.from_pretrained(model, model_path)
# Switch to evaluation mode; as the last expression of the cell this also
# displays the module structure.
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3Attention(
              (o_proj): lora.Linear(
                (base_layer): Linear(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vect

## Base model

In [None]:
# ONLY FOR base model evaluation.
# Guarded by a flag so that "Restart & Run All" keeps the model prepared in the
# cells above instead of silently replacing it with the plain base model right
# before the evaluation cells. Set the flag to True to evaluate the base model.
evaluate_base_model = False

if evaluate_base_model:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        trust_remote_code=True
    )
    model.eval()

# Test set evaluation

In [15]:
# Start the Weights & Biases run that later wandb.log calls write to.
wandb.init(
    name="attempt_5",
    project="pubmed-pretrain-evaluation",
)

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhasindumadushan325[0m ([33mhasindumadushan325-university-of-peradeniya[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [16]:
# Arguments for an evaluation-only Trainer run.
training_args = TrainingArguments(
    output_dir="./eval_output",
    do_eval=True,
    per_device_eval_batch_size=batch_size,
    # Accumulate predictions for 2 eval steps before they are moved off the
    # accelerator.
    eval_accumulation_steps=2,
    label_names=["labels"],
    # Disable the Trainer's built-in reporting integrations; metrics are sent
    # to W&B explicitly via wandb.log.
    report_to="none",
)

In [17]:
# Evaluation-only Trainer: no training dataset is supplied here; the dataset
# to score is handed to trainer.evaluate() instead.
trainer = Trainer(
    model=model,
    args=training_args,
    # `tokenizer=` is deprecated for Trainer.__init__ (presumably the source of
    # the warning printed under this cell); `processing_class=` replaces it.
    # NOTE(review): requires a transformers release that accepts
    # `processing_class` — confirm against the installed version.
    processing_class=tokenizer,
    # mlm=False selects causal-LM collation instead of masked-LM collation.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    # Bind the tokenizer so the metric helper receives it alongside the
    # prediction sample supplied by the Trainer.
    compute_metrics=lambda sample: compute_metrics_for_pretrain(sample, tokenizer)
)

  trainer = Trainer(


In [18]:
# Run evaluation on the tokenized test subset; the returned dict holds the
# "eval_*" metrics read by the following cells.
eval_result = trainer.evaluate(eval_dataset=test_set)

In [19]:
# Report the evaluation metrics; perplexity is exp(eval_loss).
print("\nEvaluation Metrics:")
print(f"Loss: {eval_result['eval_loss']:.4f}")
perplexity = torch.exp(torch.tensor(eval_result['eval_loss']))
print(f"Perplexity: {perplexity:.2f}")
# The remaining scores share one format, so they are printed from a
# (label, result key) table.
for label, key in (
    ("BLEU", "eval_bleu"),
    ("ROUGE-1", "eval_rouge1"),
    ("ROUGE-2", "eval_rouge2"),
    ("ROUGE-L", "eval_rougeL"),
):
    print(f"{label}: {eval_result[key]:.4f}")


Evaluation Metrics:
Loss: 3.5421
Perplexity: 34.54
BLEU: 0.0618
ROUGE-1: 0.3225
ROUGE-2: 0.1000
ROUGE-L: 0.2415


In [20]:
# Log the headline metrics to the active W&B run.
# - The BLEU score was previously logged under the misspelled key "BLUE";
#   runs logged before this fix still carry the old key.
# - Perplexity is converted with .item() so a plain Python float is logged
#   rather than a 0-dim tensor.
wandb.log({
    "eval_loss": eval_result['eval_loss'],
    "perplexity": torch.exp(torch.tensor(eval_result['eval_loss'])).item(),
    "BLEU": eval_result['eval_bleu'],
    "ROUGE_1": eval_result['eval_rouge1'],
    "ROUGE_2": eval_result['eval_rouge2'],
    "ROUGE_L": eval_result['eval_rougeL']
})

# Inference

In [12]:
# NOTE(review): `generate` is not defined or imported in any cell visible in
# this notebook — confirm where it comes from (a removed cell or a helper
# module) so this cell survives Restart & Run All.
prompt = "Protein synthesis in postnuclear supernatants from mengovirus-infected Ehrlich ascites tumor cells.\n"
generated_text = generate(model, prompt)
print(generated_text)

Protein synthesis in postnuclear supernatants from mengovirus-infected Ehrlich ascites tumor cells.
The synthesis of the tumor cell-specific serum albuminogen, a protein that is not synthesized by normal cells, was studied in postnuclear supernatants from mengovirus-infected Ehrlich ascites tumor cells. The synthesis of serum albuminogen was inhibited by the addition of a 10-fold excess of the tumor cell-specific protein, which was synthesized by the tumor cells. The synthesis of serum albuminogen was inhibited by the addition of a 10-fold excess of the tumor cell


In [None]:
# Store the generated sample in the W&B run under the "example_1" key.
example_log = {"example_1": generated_text}
wandb.log(example_log)