## Model Fine-tuning and Optimization

In [1]:
# Imports
import os
import re
import pandas as pd
import numpy as np
from functions import *

import torch
import gc
import pickle
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Trainer, TrainingArguments
import bitsandbytes as bnb
from datasets import Dataset

# Load data
data = '../data/ready_for_phi-2/'

print(os.path.exists(data))
print(os.listdir(data))

from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import evaluate

True
['eval', 'test_01.csv', 'test_02.csv', 'train_01.csv', 'train_02.csv']


#### Loading

In [3]:
# Load
train10k = pd.read_csv(data + 'train_01.csv')
train5k = pd.read_csv(data + 'train_02.csv')

test2k = pd.read_csv(data + 'test_01.csv')
test1k = pd.read_csv(data + 'test_02.csv')

train10k_benchmark = pd.read_csv(data + 'eval/' + 'train10k_sample.csv')

# ===================== #
# Selecting only the columns we need for fine-tuning
# ===================== #

# 15k training samples
train10k = train10k[['label', 'prompt', 'response']]
train5k = train5k[['label', 'prompt', 'response']]

# 3k test samples
test2k = test2k[['label', 'prompt', 'response']]
test1k = test1k[['label', 'prompt', 'response']]

# Rename columns
train10k.columns = ['label', 'input', 'mistral_output']
train5k.columns = ['label', 'input', 'mistral_output']
test2k.columns = ['label', 'input', 'mistral_output']
test1k.columns = ['label', 'input', 'mistral_output']

train10k.shape, train5k.shape, test2k.shape, test1k.shape

((9999, 3), (4998, 3), (1998, 3), (999, 3))

In [4]:
train10k.columns

Index(['label', 'input', 'mistral_output'], dtype='object')

In [5]:
dfs = [train10k, train5k, test2k, test1k]

for df in dfs:
  df['mistral_output'] = df.apply(lambda row: clean_output(row['input'], row['mistral_output']), axis=1)

In [6]:
for df in dfs:
  df['input'] = df['input'].str.replace('[INST]', '')
  df['input'] = df['input'].str.replace('[/INST]', '')

In [7]:
train10k.head()

Unnamed: 0,label,input,mistral_output
0,2,A customer left us a 2-star review: 'the cash...,A customer left us a 2-star review: 'the cashi...
1,2,A customer left us a 2-star review: 'here's w...,A customer left us a 2-star review: 'here's wh...
2,2,A customer left us a 2-star review: 'went to ...,A customer left us a 2-star review: 'went to t...
3,2,A customer left us a 2-star review: 'went to ...,A customer left us a 2-star review: 'went to c...
4,2,A customer left us a 2-star review: 'a whole ...,A customer left us a 2-star review: 'a whole l...


In [8]:
# Saving
fine_tuning_data = '../data/for_step_4/'

if not os.path.exists(fine_tuning_data):
  os.makedirs(fine_tuning_data)

In [9]:
train10k.to_csv(fine_tuning_data + 'train10k.csv', index=False)
train5k.to_csv(fine_tuning_data + 'train5k.csv', index=False)
test2k.to_csv(fine_tuning_data + 'test2k.csv', index=False)
test1k.to_csv(fine_tuning_data + 'test1k.csv', index=False)

if fine_tuning_data + 'train10k.csv':
  print('File 1 saved successfully!')
if fine_tuning_data + 'train5k.csv':
  print('File 2 saved successfully!')
if fine_tuning_data + 'test2k.csv':
  print('File 3 saved successfully!')
if fine_tuning_data + 'test1k.csv':
  print('File 4 saved successfully!')

File 1 saved successfully!
File 2 saved successfully!
File 3 saved successfully!
File 4 saved successfully!


### Model

In [2]:

# Define 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Load model and tokenizer
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = 'right'

model = AutoModelForCausalLM.from_pretrained(model_name,
                                             quantization_config=bnb_config,
                                             device_map={"": 0})

# Prepare model for kbit training
model = prepare_model_for_kbit_training(model)

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    # Target the attention modules in Phi-2
    target_modules=["q_proj", 
                    "k_proj", 
                    "v_proj", 
                    "out_proj",
                    "fc1",
                    "fc2"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA
model = get_peft_model(model, lora_config)

# Get model device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"==========\n! Model loaded on: {device}\n")
# print(model.quantization_method)
# print(model.config)

model.config.use_cache = False
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

model.train()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

! Model loaded on: cuda



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(51200, 2560)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x PhiDecoderLayer(
            (self_attn): PhiSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2560, out_features=2560, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2560, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDic

In [3]:
gc.collect()
torch.cuda.empty_cache()

# Print GPU memory usage
print(f"GPU memory allocated: {torch.cuda.memory_allocated()/1e6:.2f} MB")
print(f"GPU memory cached: {torch.cuda.memory_reserved()/1e6:.2f} MB")

GPU memory allocated: 2392.86 MB
GPU memory cached: 2472.54 MB


In [4]:
# for name, _ in model.named_modules():
#   print(name)

In [5]:
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token

In [6]:
def train_tokenize_function(df):
    combined = []
    for i in range(len(df["input"])):
        combined.append(
            "Instruction: " + df["input"][i] + "\nResponse: " + df["mistral_output"][i]
        )
    
    # Tokenize sequences
    tokenized = tokenizer(
        combined,
        max_length=2048,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    
    labels = tokenized["input_ids"].clone()
   
    # Mask out the loss for the instruction part
    for i in range(len(combined)):
        instruction = "Instruction: " + df["input"][i] + "\nResponse: "
        instruction_tokens = tokenizer(instruction, return_tensors="pt")["input_ids"][0]
        instruction_length = len(instruction_tokens)
       
        # Set labels to -100 for the instruction part (no loss computed)
        labels[i, :instruction_length] = -100
   
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": labels
    }

In [7]:
def test_tokenize_function(df):
    # Tokenize the prompt (input)
    tokenized_inputs = tokenizer(
        df["input"],
        max_length=2048,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    
    # Tokenize the gold responses (mistral_output)
    tokenized_gold = tokenizer(
        df["mistral_output"],
        max_length=2048,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    
    # Use the gold tokenized outputs as labels for metric computation
    tokenized_inputs["labels"] = tokenized_gold["input_ids"]
    
    return tokenized_inputs

In [8]:
# Saving
fine_tuning_data = '../data/for_step_4/'

# Load
train10k = pd.read_csv(fine_tuning_data + 'train10k.csv')
#train5k = pd.read_csv(fine_tuning_data + 'train5k.csv')
test2k = pd.read_csv(fine_tuning_data + 'test2k.csv')
#test1k = pd.read_csv(fine_tuning_data + 'test1k.csv')


# Time constraints. I'll just use 999 train, 333 test samples
from sklearn.model_selection import train_test_split
train_left, train = train_test_split(train10k, 
                         test_size=999, 
                         random_state=42,
                         stratify=train10k['label'])
train_left, test = train_test_split(test2k, 
                        test_size=333, 
                        random_state=42,
                        stratify=test2k['label'])

# train.shape, test.shape, train['label'].value_counts(), test['label'].value_counts()

# Selecting only the columns we need for fine-tuning, I was splitting by label.
train = train[['input', 'mistral_output']]
test = test[['input', 'mistral_output']]

# Convert to HuggingFace Datasets
train_dataset = Dataset.from_pandas(train)
train_dataset = train_dataset.remove_columns(['__index_level_0__'])

test_dataset = Dataset.from_pandas(test)
test_dataset = test_dataset.remove_columns(['__index_level_0__'])

# Tokenize
tokenized_train_dataset = train_dataset.map(train_tokenize_function,
                                            batched=True)
tokenized_test_dataset = test_dataset.map(test_tokenize_function,
                                          batched=True)

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

Map:   0%|          | 0/333 [00:00<?, ? examples/s]

In [9]:
"""
# Selecting only the columns we need for fine-tuning, I was splitting by label.
train10k = train10k[['input', 'mistral_output']]
train5k = train5k[['input', 'mistral_output']]
test2k = test2k[['input']]
test1k = test1k[['input']]

# Convert to HuggingFace Datasets
from datasets import Dataset

train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)

train10k_dataset = Dataset.from_pandas(train10k)
train5k_dataset = Dataset.from_pandas(train5k)
test2k_dataset = Dataset.from_pandas(test2k)
test1k_dataset = Dataset.from_pandas(test1k)

# Tokenize
train = train.map(train_tokenize_function,
                  batched=True)
test = test.map(test_tokenize_function, 
                  batched=True)
"""

"\n# Selecting only the columns we need for fine-tuning, I was splitting by label.\ntrain10k = train10k[['input', 'mistral_output']]\ntrain5k = train5k[['input', 'mistral_output']]\ntest2k = test2k[['input']]\ntest1k = test1k[['input']]\n\n# Convert to HuggingFace Datasets\nfrom datasets import Dataset\n\ntrain = Dataset.from_pandas(train)\ntest = Dataset.from_pandas(test)\n\ntrain10k_dataset = Dataset.from_pandas(train10k)\ntrain5k_dataset = Dataset.from_pandas(train5k)\ntest2k_dataset = Dataset.from_pandas(test2k)\ntest1k_dataset = Dataset.from_pandas(test1k)\n\n# Tokenize\ntrain = train.map(train_tokenize_function,\n                  batched=True)\ntest = test.map(test_tokenize_function, \n                  batched=True)\n"

In [10]:
# train_dataset, test_dataset

In [11]:
# tokenized_train_dataset, tokenized_test_dataset

In [12]:
#print(tokenizer.decode(tokenized_train_dataset['input_ids'][0]))
#print('\n\n')

#print(tokenizer.decode(tokenized_test_dataset['input_ids'][0]))
#print(tokenizer.decode(tokenized_test_dataset['labels'][0]))

In [13]:
# Model output
model_path = '../models/'

if not os.path.exists(model_path):
  os.makedirs(model_path)

#### Evaluate

In [14]:
import logging
import evaluate

# Load evaluation metrics
bleurt = evaluate.load("bleurt", trust_remote_code=True)
bertscore = evaluate.load("bertscore")
meteor = evaluate.load("meteor")

def compute_metrics(eval_preds):
    """
    Computes BLEURT, BERTScore (F1), and METEOR for evaluating generated feedback.

    Args:
        eval_preds (tuple): A tuple containing (predictions, references).

    Returns:
        dict: A dictionary with metric scores.
    """
    predictions, references = eval_preds

    # Ensure string format
    predictions = [str(pred).strip() for pred in predictions]
    references = [str(ref).strip() for ref in references]

    # Compute BLEURT
    bleurt_scores = bleurt.compute(predictions=predictions, references=references)["scores"]

    # Compute BERTScore(F1)
    bertscore_f1 = bertscore.compute(predictions=predictions, references=references, lang="en")["f1"]

    # Compute METEOR
    meteor_score = meteor.compute(predictions=predictions, references=references)["meteor"]

    metrics = {
        "bleurt": np.mean(bleurt_scores),
        "bertscore_f1": np.mean(bertscore_f1),
        "meteor": meteor_score
    }

    logger.info(f"Eval Metrics: {metrics}")  # Force logging

    return metrics

Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: evaluate.load('bleurt', 'bleurt-large-512').


INFO:tensorflow:Reading checkpoint C:\Users\kadm2\.cache\huggingface\metrics\bleurt\default\downloads\extracted\1ed47b7280a9b4162e745e8e509d21a5ca48976269d7d45c0c7b6c6e350c760e\bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kadm2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kadm2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kadm2\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Training

In [18]:
gc.collect()
torch.cuda.empty_cache()

In [20]:
training_args = TrainingArguments(
  output_dir=model_path,
  do_train=True,
  do_eval=True,
  do_predict=True,
  
  eval_strategy="epoch",
  save_strategy="epoch",
  eval_accumulation_steps=8,
  logging_strategy="epoch",
  num_train_epochs=3,
  
  learning_rate=2e-5,
  lr_scheduler_type="cosine",
  warmup_ratio=0.1,
  weight_decay=0.01,
  
  fp16=True,
  dataloader_num_workers=4,
  
  optim="adamw_torch_fused",
  load_best_model_at_end=True,
  metric_for_best_model="meteor",
  greater_is_better=True,
  
  report_to="tensorboard",
  resume_from_checkpoint=True,
  
  per_device_eval_batch_size=2,
  per_device_train_batch_size=2,
  gradient_accumulation_steps=2,
  gradient_checkpointing=True,
  save_total_limit=3
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model(os.path.join(model_path, 'phi-2_01'))

In [None]:
gc.collect()
torch.cuda.empty_cache()

## Fine-tuned Model

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

base_model_name = "microsoft/phi-2"  # Your original model
adapter_path = "../models/phi-2_01"

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load LoRA adapter and merge
model = PeftModel.from_pretrained(base_model, adapter_path)
model = model.merge_and_unload()  # Merge LoRA with base model

# Move to GPU if available
model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((256

In [2]:
merged_model_path = "../models/phi-2_full"
#model.save_pretrained(merged_model_path)
#tokenizer.save_pretrained(merged_model_path)

In [5]:
gc.collect()
torch.cuda.empty_cache()

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(merged_model_path)

# Load the model
model = AutoModelForCausalLM.from_pretrained(
    merged_model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((256

In [53]:
# Check for trainable parameters in the model
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())

print(f"Trainable parameters: {trainable_params}")
print(f"Total parameters: {total_params}")

Trainable parameters: 2779683840
Total parameters: 2779683840


In [54]:
train10k['input'][0]

"[INST] A customer left us a 2-star review: 'the cashier told my husband that should he get change and the air machine just don't work don't come back in asking for a refund. this location the employees are uneducated on any kind of hospitality when having contact with human customers. i just don't care for employees who have a rude attitude. how do store owners proudly have a rude person as a cashier? well let's see how that will work out for them. attention all friends the people are rude at this location.' The customer feels annoyance, anger, neutral. Concisely, how can we best improve our services for this customer's experience? [/INST]"

In [56]:
prompt = train10k['input'][1]
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

with torch.no_grad():  # No need to track gradients during inference
    output = model.generate(**inputs, 
                            max_new_tokens=500,
                            no_repeat_ngram_size=3,
                            do_sample=True,
                            pad_token_id=tokenizer.eos_token_id,
                            temperature=0.2,
                            top_p=0.9,
                            top_k=50) 

print(tokenizer.decode(output[0], skip_special_tokens=True))  # Decode and print the result

[INST] A customer left us a 2-star review: 'here's what happened n we were suppose to be at the regular mgm.. so we pulled up, put our car in valet, got a bell boy to take our stuff.. and then we realized they gave us rooms at the signature. what a pain!!!! n then when we tried to check in we were suppose to have 2 rooms... but they couldn't find our other reservation because we booked through an online travel company so they only gave us one room!!! n the next day we had to spend at least an hour trying to get compensated for the room that we pre paid for! finally they gave us back the money and also said they would give us 200 to spend on food and bev. n so, we ordered 158 in food that night... and tipped 30! n then when i went to check out... they had all the room service charges and all the room charges on my card!!! it was such a hassle to get them off!!! in the end, they took off all charges but the 30 tip which i was still mad about. n nthe rooms were super nice! they had a nice

In [57]:
print(tokenizer.encode(prompt))
print(tokenizer.decode(tokenizer.encode(prompt)))

[58, 38604, 60, 317, 6491, 1364, 514, 257, 362, 12, 7364, 2423, 25, 705, 1456, 338, 644, 3022, 299, 356, 547, 11691, 284, 307, 379, 262, 3218, 10527, 76, 492, 523, 356, 5954, 510, 11, 1234, 674, 1097, 287, 1188, 316, 11, 1392, 257, 8966, 2933, 284, 1011, 674, 3404, 492, 290, 788, 356, 6939, 484, 2921, 514, 9519, 379, 262, 9877, 13, 644, 257, 2356, 13896, 299, 788, 618, 356, 3088, 284, 2198, 287, 356, 547, 11691, 284, 423, 362, 9519, 986, 475, 484, 3521, 470, 1064, 674, 584, 24048, 780, 356, 21765, 832, 281, 2691, 3067, 1664, 523, 484, 691, 2921, 514, 530, 2119, 10185, 299, 262, 1306, 1110, 356, 550, 284, 4341, 379, 1551, 281, 1711, 2111, 284, 651, 34304, 329, 262, 2119, 326, 356, 662, 3432, 329, 0, 3443, 484, 2921, 514, 736, 262, 1637, 290, 635, 531, 484, 561, 1577, 514, 939, 284, 4341, 319, 2057, 290, 307, 85, 13, 299, 523, 11, 356, 6149, 24063, 287, 2057, 326, 1755, 986, 290, 28395, 1542, 0, 299, 788, 618, 1312, 1816, 284, 2198, 503, 986, 484, 550, 477, 262, 2119, 2139, 4530, 290, 47