##### **Installing dependencies**

In [1]:
!pip install ipython-autotime gdown evaluate accelerate bitsandbytes peft loralib huggingface_hub transformers peft

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting loralib
  Downloading loralib-0.1.2-py3-none-any.whl.metadata (15 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.27.1-py3-none-any.whl.metadata (13 kB)
Downloading ipython_autotime-0.3.2-py2.py3-none-any.whl (7.0 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m

##### **Importing dependencies**

In [2]:
%load_ext autotime
import pandas as pd
import numpy as np
import nltk
import os
import zipfile
import tarfile
import re
import gdown
import gzip
import shutil
import wandb
import time
import torch
import psutil
import gc

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, precision_recall_fscore_support
from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    RobertaTokenizerFast, 
    RobertaForSequenceClassification,
    GPT2TokenizerFast, 
    GPT2ForSequenceClassification,
    GenerationConfig,
    TrainingArguments,
    Trainer,
    pipeline,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    DataCollatorWithPadding,
    AdamW,
    get_scheduler,
    GPT2Config
)
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import time
import evaluate
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel,
    PeftConfig,
)
from huggingface_hub import login
import kagglehub

# from nltk.corpus import stopwords
# from nltk import word_tokenize
# from nltk.stem import WordNetLemmatizer
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from google.colab import files
# from scipy.sparse import hstack
# from gensim.models import Word2Vec

import warnings

# Suppress specific warnings
warnings.filterwarnings("ignore", message=".*clean_up_tokenization_spaces.*")
warnings.filterwarnings("ignore", message="Some weights of DistilBertForSequenceClassification were not initialized.*")
warnings.filterwarnings("ignore", message=".*evaluation_strategy.*")
warnings.filterwarnings("ignore", message=".*gather along dimension 0.*")
warnings.filterwarnings("ignore", message=".*GradScaler.*")

time: 29.8 s (started: 2025-01-06 18:38:31 +00:00)


In [3]:
# Disable wandb Logging
os.environ["WANDB_MODE"] = "disabled"
wandb.init()

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda
time: 5.73 s (started: 2025-01-06 18:39:05 +00:00)


In [5]:
# import torch
# torch.cuda.empty_cache()

time: 377 µs (started: 2025-01-06 18:39:34 +00:00)


##### **Supporting functions**

In [6]:
def clean_review(review):
    review = re.sub(r'<.*?>', '', review)
    review = re.sub(r'http\S+|www\S+|https\S+', '', review, flags=re.MULTILINE)
    review = review.strip()
    return review

# def preprocess_function(examples):
#     inputs = tokenizer(examples["review"], truncation=True, padding=True, max_length=512)
#     inputs["labels"] = [1 if label.lower() == "positive" else 0 for label in examples["sentiment"]]
#     return inputs

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

time: 560 µs (started: 2025-01-06 18:39:40 +00:00)


##### **Loading data**

In [7]:
train_df_full = pd.read_csv("/kaggle/input/imdb-dataset-gpt2-3/train.csv")
train_df = train_df_full.sample(n=3000, random_state=42)
train_df['review'] = train_df['review'].apply(clean_review)
train_df.reset_index(drop=True, inplace=True)

time: 970 ms (started: 2025-01-06 18:39:46 +00:00)


In [8]:
test_df_full = pd.read_csv("/kaggle/input/imdb-dataset-gpt2-3/test.csv")
test_df = test_df_full.sample(n=2000, random_state=42)
test_df['review'] = test_df['review'].apply(clean_review)
test_df.reset_index(drop=True, inplace=True)

time: 596 ms (started: 2025-01-06 18:39:47 +00:00)


In [9]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

time: 73.9 ms (started: 2025-01-06 18:39:48 +00:00)


### **Experimentations for GPT2 - Phase 1:** keeping LoRA hyperparams fixed

In [8]:
model_checkpoint = "gpt2"
configuration = GPT2Config()
tokenizer = GPT2TokenizerFast.from_pretrained(model_checkpoint)
tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    inputs = tokenizer(examples["review"], truncation=True, padding='max_length')
    inputs["labels"] = [1 if label.lower() == "positive" else 0 for label in examples["sentiment"]]
    return inputs

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

model = GPT2ForSequenceClassification(configuration).from_pretrained(model_checkpoint, num_labels=2).to(device)
model.config.pad_token_id = model.config.eos_token_id

# Fixed LoRA parameters
rank = 8 
target_matrices = ["attn.c_attn", "attn.c_proj"]
lora_alpha = 16
lora_dropout = 0.1

# Changing hyperparams for batch size, epochs and learning rates
batch_sizes = [16, 32]
epochs_list = [3, 5]
learning_rates = [3e-5, 1e-4]

training_dropout = 0.1 # Fixed

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


time: 11.1 s (started: 2025-01-06 02:33:48 +00:00)


In [9]:
print(f"Model is running on device: {model.device}")

Model is running on device: cuda:0
time: 580 µs (started: 2025-01-06 02:34:02 +00:00)


In [10]:
results_phase_1 = []

for batch_size in batch_sizes:
    for epochs in epochs_list:
        for learning_rate in learning_rates:
            lora_config = LoraConfig(
                r=rank,
                lora_alpha=lora_alpha,
                target_modules=target_matrices,
                lora_dropout=lora_dropout,
                task_type="SEQ_CLS"
            )

            model_with_lora = get_peft_model(model, lora_config)
            
            start_time = time.time()
            print(f"\nRunning experiment with: Batch Size: {batch_size}, Epochs: {epochs}, Learning Rate: {learning_rate}")

            num_parameters = sum(p.numel() for p in model_with_lora.parameters())
            trainable_parameters = sum(p.numel() for p in model_with_lora.parameters() if p.requires_grad)
            trainable_percentage = (trainable_parameters / num_parameters) * 100
            
            print(f"Model has {num_parameters:,} total parameters")
            print(f"Model has {trainable_parameters:,} trainable parameters")
            print(f"{trainable_percentage:.2f}% of the parameters are trainable")

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                gpu_memory = torch.cuda.memory_allocated() / 1024**2  # in MB
                print(f"GPU memory allocated: {gpu_memory:.2f} MB")

            wandb.config.update({"model/num_parameters": model.num_parameters()}, allow_val_change=True)

            output_dir = f"./results_phase1_r{rank}_alpha{lora_alpha}_drop{lora_dropout}_targets{'_'.join(target_matrices)}_bs{batch_size}_epochs{epochs}_lr{learning_rate}"
            training_args = TrainingArguments(
                output_dir=output_dir,
                evaluation_strategy="epoch",
                learning_rate=learning_rate,
                per_device_train_batch_size=8,
                per_device_eval_batch_size=batch_size,
                num_train_epochs=epochs,
                weight_decay=0.01,
                save_total_limit=1,
                save_strategy="epoch",
                logging_dir="./logs",
                logging_steps=500,
                fp16=True,
                load_best_model_at_end=True,
            )

            trainer = Trainer(
                model=model_with_lora,
                args=training_args,
                train_dataset=tokenized_train,
                eval_dataset=tokenized_test,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics
            )

            trainer.train()
            metrics = trainer.evaluate()

            end_time = time.time()
            elapsed_time = end_time - start_time
            print(f"Training time: {elapsed_time:.2f} seconds")

            results_phase_1.append({
                "Model": "GPT2",
                "Batch Size": batch_size,
                "Epochs": epochs,
                "Learning Rate": learning_rate,
                "Rank": rank,
                "Alpha": lora_alpha,
                "LoRA Dropout": lora_dropout,
                "Target Matrices": target_matrices,
                "Accuracy": metrics["eval_accuracy"],
                "Precision": metrics["eval_precision"],
                "Recall": metrics["eval_recall"],
                "F1-Score": metrics["eval_f1"]                
            })

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Running experiment with: Batch Size: 16, Epochs: 3, Learning Rate: 3e-05
Model has 124,885,248 total parameters
Model has 443,904 trainable parameters
0.36% of the parameters are trainable
GPU memory allocated: 489.17 MB




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.645717,0.6465,0.66565,0.6465,0.639822
2,No log,0.598039,0.7145,0.72713,0.7145,0.712179
3,0.786600,0.574032,0.7345,0.735706,0.7345,0.734547


Training time: 1467.19 seconds

Running experiment with: Batch Size: 16, Epochs: 3, Learning Rate: 0.0001
Model has 124,885,248 total parameters
Model has 443,904 trainable parameters
0.36% of the parameters are trainable
GPU memory allocated: 510.50 MB


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.397508,0.837,0.837968,0.837,0.837041
2,No log,0.294944,0.884,0.888091,0.884,0.883915
3,0.452900,0.266308,0.896,0.896578,0.896,0.89603


Training time: 1474.90 seconds

Running experiment with: Batch Size: 16, Epochs: 5, Learning Rate: 3e-05
Model has 124,885,248 total parameters
Model has 443,904 trainable parameters
0.36% of the parameters are trainable
GPU memory allocated: 510.50 MB


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.64104,0.658,0.671007,0.658,0.654226
2,No log,0.57317,0.7305,0.740916,0.7305,0.728911
3,0.767200,0.488591,0.7865,0.795655,0.7865,0.785644
4,0.767200,0.419876,0.822,0.822422,0.822,0.82205
5,0.767200,0.401566,0.835,0.836041,0.835,0.83504


Training time: 2393.38 seconds

Running experiment with: Batch Size: 16, Epochs: 5, Learning Rate: 0.0001
Model has 124,885,248 total parameters
Model has 443,904 trainable parameters
0.36% of the parameters are trainable
GPU memory allocated: 510.50 MB


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.348961,0.8555,0.857991,0.8555,0.855475
2,No log,0.303687,0.892,0.897177,0.892,0.891875
3,0.436600,0.24474,0.903,0.903462,0.903,0.903028
4,0.436600,0.267985,0.899,0.901834,0.899,0.898976
5,0.436600,0.245444,0.9045,0.905048,0.9045,0.904528


Training time: 2381.80 seconds

Running experiment with: Batch Size: 32, Epochs: 3, Learning Rate: 3e-05
Model has 124,885,248 total parameters
Model has 443,904 trainable parameters
0.36% of the parameters are trainable
GPU memory allocated: 510.50 MB


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.646691,0.6445,0.666785,0.6445,0.636406
2,No log,0.598885,0.71,0.725103,0.71,0.706985
3,0.786900,0.573231,0.736,0.738233,0.736,0.735937


Training time: 1464.66 seconds

Running experiment with: Batch Size: 32, Epochs: 3, Learning Rate: 0.0001
Model has 124,885,248 total parameters
Model has 443,904 trainable parameters
0.36% of the parameters are trainable
GPU memory allocated: 510.50 MB


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.397508,0.837,0.837968,0.837,0.837041
2,No log,0.294945,0.884,0.888091,0.884,0.883915
3,0.452900,0.266308,0.896,0.896578,0.896,0.89603


Training time: 1464.61 seconds

Running experiment with: Batch Size: 32, Epochs: 5, Learning Rate: 3e-05
Model has 124,885,248 total parameters
Model has 443,904 trainable parameters
0.36% of the parameters are trainable
GPU memory allocated: 510.50 MB


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.64104,0.658,0.671007,0.658,0.654226
2,No log,0.57317,0.7305,0.740916,0.7305,0.728911
3,0.767200,0.488591,0.7865,0.795655,0.7865,0.785644
4,0.767200,0.419876,0.822,0.822422,0.822,0.82205
5,0.767200,0.401566,0.835,0.836041,0.835,0.83504


Training time: 2380.87 seconds

Running experiment with: Batch Size: 32, Epochs: 5, Learning Rate: 0.0001
Model has 124,885,248 total parameters
Model has 443,904 trainable parameters
0.36% of the parameters are trainable
GPU memory allocated: 510.50 MB


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.348961,0.8555,0.857991,0.8555,0.855475
2,No log,0.303689,0.892,0.897177,0.892,0.891875
3,0.436600,0.24474,0.903,0.903462,0.903,0.903028
4,0.436600,0.267985,0.899,0.901834,0.899,0.898976
5,0.436600,0.245443,0.9045,0.905048,0.9045,0.904528


Training time: 2380.51 seconds
time: 4h 16min 48s (started: 2025-01-06 02:34:05 +00:00)


In [11]:
# Testing evaluations saved
results_df_phase_1 = pd.DataFrame(results_phase_1)
results_df_phase_1.to_csv("8_FT_GPT2_Experiments_FixedLoRA.csv", index=False)

time: 7 ms (started: 2025-01-06 06:51:07 +00:00)


### **Experimentations for GPT2 - Phase 2:** changing LoRA hyperparameters

In [10]:
model_checkpoint = "gpt2"
configuration = GPT2Config()
tokenizer = GPT2TokenizerFast.from_pretrained(model_checkpoint)
tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    inputs = tokenizer(examples["review"], truncation=True, padding='max_length')
    inputs["labels"] = [1 if label.lower() == "positive" else 0 for label in examples["sentiment"]]
    return inputs

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

model = GPT2ForSequenceClassification(configuration).from_pretrained(model_checkpoint, num_labels=2).to(device)
model.config.pad_token_id = model.config.eos_token_id

# Fixed parameters for batch size and epochs, etc
fixed_batch_size = 16
fixed_epochs = 3
fixed_learning_rate = 1e-4
training_dropout = 0.1

# LoRA parameter combinations
ranks = [8, 16]
target_matrices_list = [["attn.c_attn"], ["attn.c_proj"], ["attn.c_attn", "attn.c_proj"]]
lora_alpha = 16
lora_dropouts = [0.1, 0.2]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


time: 10.9 s (started: 2025-01-06 18:40:04 +00:00)


In [11]:
results_phase_2 = []

for rank in ranks:
    for target_matrices in target_matrices_list:
        for lora_dropout in lora_dropouts:
            lora_config = LoraConfig(
                r=rank,
                lora_alpha=lora_alpha,  # Fixed lora_alpha
                target_modules=target_matrices,
                lora_dropout=lora_dropout,
                task_type="SEQ_CLS"
            )

            model_with_lora = get_peft_model(model, lora_config)

            start_time = time.time()
            print(f"\nRunning experiment with: Rank: {rank}, Target Matrices: {target_matrices}, LoRA Dropout: {lora_dropout}")

            num_parameters = sum(p.numel() for p in model_with_lora.parameters())
            trainable_parameters = sum(p.numel() for p in model_with_lora.parameters() if p.requires_grad)
            trainable_percentage = (trainable_parameters / num_parameters) * 100
            
            print(f"Model has {num_parameters:,} total parameters")
            print(f"Model has {trainable_parameters:,} trainable parameters")
            print(f"{trainable_percentage:.2f}% of the parameters are trainable")

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                gpu_memory = torch.cuda.memory_allocated() / 1024**2  # in MB
                print(f"GPU memory allocated: {gpu_memory:.2f} MB")

            wandb.config.update({"model/num_parameters": model.num_parameters()}, allow_val_change=True)

            output_dir = f"./results_phase2_r{rank}_alpha{lora_alpha}_drop{lora_dropout}_targets{'_'.join(target_matrices)}_bs{fixed_batch_size}_epochs{fixed_epochs}_lr{fixed_learning_rate}"
            training_args = TrainingArguments(
                output_dir=output_dir,
                evaluation_strategy="epoch",
                learning_rate=fixed_learning_rate,
                per_device_train_batch_size=8,
                per_device_eval_batch_size=fixed_batch_size,
                num_train_epochs=fixed_epochs,
                weight_decay=0.01,
                save_total_limit=1,
                save_strategy="epoch",
                logging_dir="./logs",
                logging_steps=500,
                fp16=True,
                load_best_model_at_end=True,
            )

            trainer = Trainer(
                model=model_with_lora,
                args=training_args,
                train_dataset=tokenized_train,
                eval_dataset=tokenized_test,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics
            )

            trainer.train()
            metrics = trainer.evaluate()

            end_time = time.time()
            elapsed_time = end_time - start_time
            print(f"Training time: {elapsed_time:.2f} seconds")

            results_phase_2.append({
                "Model": "GPT2",
                "Batch Size": fixed_batch_size,
                "Epochs": fixed_epochs,
                "Learning Rate": fixed_learning_rate,
                "Rank": rank,
                "Alpha": lora_alpha,  # Fixed alpha
                "LoRA Dropout": lora_dropout,
                "Target Matrices": target_matrices,
                "Accuracy": metrics["eval_accuracy"],
                "Precision": metrics["eval_precision"],
                "Recall": metrics["eval_recall"],
                "F1-Score": metrics["eval_f1"]
            })




Running experiment with: Rank: 8, Target Matrices: ['attn.c_attn'], LoRA Dropout: 0.1
Model has 124,737,792 total parameters
Model has 296,448 trainable parameters
0.24% of the parameters are trainable
GPU memory allocated: 488.61 MB




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.504908,0.752,0.763523,0.752,0.747856
2,No log,0.330734,0.861,0.863449,0.861,0.860979
3,0.500500,0.305279,0.8775,0.877489,0.8775,0.877484


Training time: 1417.63 seconds

Running experiment with: Rank: 8, Target Matrices: ['attn.c_attn'], LoRA Dropout: 0.2
Model has 124,737,792 total parameters
Model has 296,448 trainable parameters
0.24% of the parameters are trainable
GPU memory allocated: 508.25 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.498833,0.7555,0.76508,0.7555,0.752014
2,No log,0.318343,0.8685,0.87273,0.8685,0.86839
3,0.491800,0.296243,0.8825,0.882547,0.8825,0.882513


Training time: 1419.29 seconds

Running experiment with: Rank: 8, Target Matrices: ['attn.c_proj'], LoRA Dropout: 0.1
Model has 124,885,248 total parameters
Model has 443,904 trainable parameters
0.36% of the parameters are trainable




GPU memory allocated: 507.69 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.269506,0.895,0.896166,0.895,0.895024
2,No log,0.263275,0.901,0.902545,0.901,0.901014
3,0.267700,0.256167,0.9015,0.902241,0.9015,0.901528


Training time: 1444.22 seconds

Running experiment with: Rank: 8, Target Matrices: ['attn.c_proj'], LoRA Dropout: 0.2
Model has 124,885,248 total parameters
Model has 443,904 trainable parameters
0.36% of the parameters are trainable
GPU memory allocated: 509.37 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.260008,0.904,0.905263,0.904,0.90402
2,No log,0.268899,0.9015,0.904276,0.9015,0.901479
3,0.235700,0.259359,0.906,0.906518,0.906,0.906027


Training time: 1443.10 seconds

Running experiment with: Rank: 8, Target Matrices: ['attn.c_attn', 'attn.c_proj'], LoRA Dropout: 0.1
Model has 124,885,248 total parameters
Model has 443,904 trainable parameters
0.36% of the parameters are trainable
GPU memory allocated: 510.50 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.432218,0.806,0.81344,0.806,0.80413
2,No log,0.292086,0.8765,0.878059,0.8765,0.876516
3,0.451100,0.276766,0.8925,0.892515,0.8925,0.892505


Training time: 1443.20 seconds

Running experiment with: Rank: 8, Target Matrices: ['attn.c_attn', 'attn.c_proj'], LoRA Dropout: 0.2
Model has 124,885,248 total parameters
Model has 443,904 trainable parameters
0.36% of the parameters are trainable
GPU memory allocated: 510.50 MB


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

time: 2h 2min 25s (started: 2025-01-06 18:40:21 +00:00)


In [12]:
# Testing evaluations saved
results_df_phase_2 = pd.DataFrame(results_phase_2)
results_df_phase_2.to_csv("8_FT_GPT2_Experiments_FixedTrainingHyp.csv", index=False)

time: 12.1 ms (started: 2025-01-06 20:42:53 +00:00)


#### **Did not pursue Phase 2 Experiments for GPT2 fully because of its long execution time for all the 12 configurations. Phase 1 took 4hrs for 8 combinations, so this is bound to take a lot more time.**