##### **Installing dependencies**

In [1]:
!pip install ipython-autotime gdown evaluate accelerate bitsandbytes peft loralib huggingface_hub transformers peft

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting loralib
  Downloading loralib-0.1.2-py3-none-any.whl.metadata (15 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.27.0-py3-none-any.whl.metadata (13 kB)
Downloading ipython_autotime-0.3.2-py2.py3-none-any.whl (7.0 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m

##### **Importing dependencies**

In [2]:
%load_ext autotime
import pandas as pd
import numpy as np
import nltk
import os
import zipfile
import tarfile
import re
import gdown
import gzip
import shutil
import wandb
import time
import torch
import psutil
# import torch_xla
# import torch_xla.core.xla_model as xm
# import torch_xla.debug.metrics as met

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, precision_recall_fscore_support
from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    RobertaTokenizerFast, 
    RobertaForSequenceClassification,
    GPT2TokenizerFast, 
    GPT2ForSequenceClassification,
    GenerationConfig,
    TrainingArguments,
    Trainer,
    pipeline,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    DataCollatorWithPadding,
    AdamW,
    get_scheduler
)
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import time
import evaluate
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel,
    PeftConfig,
)
from huggingface_hub import login
import kagglehub

# from nltk.corpus import stopwords
# from nltk import word_tokenize
# from nltk.stem import WordNetLemmatizer
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from google.colab import files
# from scipy.sparse import hstack
# from gensim.models import Word2Vec

import warnings

# Suppress specific warnings
warnings.filterwarnings("ignore", message=".*clean_up_tokenization_spaces.*")
# warnings.filterwarnings("ignore", message="Some weights of DistilBertForSequenceClassification were not initialized.*")
warnings.filterwarnings("ignore", message="Some weights of RobertaForSequenceClassification were not initialized.*")
warnings.filterwarnings("ignore", category=FutureWarning, message=".*GradScaler.*")
warnings.filterwarnings("ignore", message=".*evaluation_strategy.*")
warnings.filterwarnings("ignore", message=".*gather along dimension 0.*")

time: 17.8 s (started: 2025-01-06 00:29:57 +00:00)


In [3]:
# Disable wandb Logging
os.environ["WANDB_MODE"] = "disabled"
wandb.init()

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# device = xm.xla_device()  # Change device to TPU

Using device: cuda
time: 6.9 s (started: 2025-01-06 00:30:24 +00:00)


##### **Supporting functions**

In [4]:
def clean_review(review):
    review = re.sub(r'<.*?>', '', review)
    review = re.sub(r'http\S+|www\S+|https\S+', '', review, flags=re.MULTILINE)
    review = review.strip()
    return review

def preprocess_function(examples):
    inputs = tokenizer(examples["review"], truncation=True, padding=True, max_length=128)
    inputs["labels"] = [1 if label.lower() == "positive" else 0 for label in examples["sentiment"]]
    return inputs

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

time: 717 µs (started: 2025-01-06 00:30:31 +00:00)


##### **Loading data**

In [5]:
train_df_full = pd.read_csv("/kaggle/input/imdb-dataset/train.csv")
train_df = train_df_full.sample(n=3000, random_state=42)
train_df['review'] = train_df['review'].apply(clean_review)
train_df.reset_index(drop=True, inplace=True)

time: 945 ms (started: 2025-01-06 00:30:35 +00:00)


In [6]:
test_df_full = pd.read_csv("/kaggle/input/imdb-dataset/test.csv")
test_df = test_df_full.sample(n=2000, random_state=42)
test_df['review'] = test_df['review'].apply(clean_review)
test_df.reset_index(drop=True, inplace=True)

time: 562 ms (started: 2025-01-06 00:30:37 +00:00)


In [7]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

time: 70.9 ms (started: 2025-01-06 00:30:45 +00:00)


### **Experimentations for RoBERTa - Phase 1:** keeping LoRA hyperparams fixed

In [8]:
model_checkpoint = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_checkpoint)
model = RobertaForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to(device)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Fixed LoRA parameters
rank = 8 
target_matrices = ["attention.self.query", "attention.self.key", "attention.self.value"]
# target_matrices = ["attention.self.query", "attention.self.key", "attention.self.value", "attention.output.dense"]
lora_alpha = 16
lora_dropout = 0.1

# Changing hyperparams for batch size, epochs and learning rates
batch_sizes = [8, 16]
epochs_list = [3, 5]
learning_rates = [3e-5, 1e-4]

training_dropout = 0.1 # Fixed

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

time: 11.3 s (started: 2025-01-05 23:32:42 +00:00)


In [10]:
print(f"Model is running on device: {model.device}")

Model is running on device: cuda:0
time: 527 µs (started: 2025-01-05 23:33:00 +00:00)


In [11]:
results_phase_1 = []

for batch_size in batch_sizes:
    for epochs in epochs_list:
        for learning_rate in learning_rates:
            lora_config = LoraConfig(
                r=rank,
                lora_alpha=lora_alpha,
                target_modules=target_matrices,
                lora_dropout=lora_dropout,
                task_type="SEQ_CLS"
            )

            model_with_lora = get_peft_model(model, lora_config)
            
            start_time = time.time()
            print(f"\nRunning experiment with: Batch Size: {batch_size}, Epochs: {epochs}, Learning Rate: {learning_rate}")

            num_parameters = sum(p.numel() for p in model_with_lora.parameters())
            trainable_parameters = sum(p.numel() for p in model_with_lora.parameters() if p.requires_grad)
            trainable_percentage = (trainable_parameters / num_parameters) * 100
            
            print(f"Model has {num_parameters:,} total parameters")
            print(f"Model has {trainable_parameters:,} trainable parameters")
            print(f"{trainable_percentage:.2f}% of the parameters are trainable")

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                gpu_memory = torch.cuda.memory_allocated() / 1024**2  # in MB
                print(f"GPU memory allocated: {gpu_memory:.2f} MB")

            wandb.config.update({"model/num_parameters": model.num_parameters()}, allow_val_change=True)

            output_dir = f"./results_phase1_r{rank}_alpha{lora_alpha}_drop{lora_dropout}_targets{'_'.join(target_matrices)}_bs{batch_size}_epochs{epochs}_lr{learning_rate}"
            training_args = TrainingArguments(
                output_dir=output_dir,
                evaluation_strategy="epoch",
                learning_rate=learning_rate,
                per_device_train_batch_size=8,
                per_device_eval_batch_size=batch_size,
                num_train_epochs=epochs,
                weight_decay=0.01,
                save_total_limit=1,
                save_strategy="epoch",
                logging_dir="./logs",
                logging_steps=10,
                load_best_model_at_end=True
            )

            trainer = Trainer(
                model=model_with_lora,
                args=training_args,
                train_dataset=tokenized_train,
                eval_dataset=tokenized_test,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics
            )

            trainer.train()
            metrics = trainer.evaluate()

            end_time = time.time()
            elapsed_time = end_time - start_time
            print(f"Training time: {elapsed_time:.2f} seconds")

            results_phase_1.append({
                "Model": "RoBERTa",
                "Batch Size": batch_size,
                "Epochs": epochs,
                "Learning Rate": learning_rate,
                "Rank": rank,
                "Alpha": lora_alpha,
                "LoRA Dropout": lora_dropout,
                "Target Matrices": target_matrices,
                "Accuracy": metrics["eval_accuracy"],
                "Precision": metrics["eval_precision"],
                "Recall": metrics["eval_recall"],
                "F1-Score": metrics["eval_f1"]                
            })


Running experiment with: Batch Size: 8, Epochs: 3, Learning Rate: 3e-05
Model has 125,681,668 total parameters
Model has 1,034,498 trainable parameters
0.82% of the parameters are trainable
GPU memory allocated: 480.68 MB




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6659,0.650086,0.846,0.85079,0.846,0.845075
2,0.2956,0.366737,0.86,0.861138,0.86,0.860032
3,0.2923,0.337767,0.8745,0.874692,0.8745,0.874412


Training time: 140.11 seconds

Running experiment with: Batch Size: 8, Epochs: 3, Learning Rate: 0.0001
Model has 125,681,668 total parameters
Model has 1,034,498 trainable parameters
0.82% of the parameters are trainable
GPU memory allocated: 508.77 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2324,0.308161,0.89,0.891022,0.89,0.889808
2,0.2354,0.315274,0.8925,0.892545,0.8925,0.892511
3,0.2264,0.302369,0.893,0.893669,0.893,0.892859


Training time: 139.22 seconds

Running experiment with: Batch Size: 8, Epochs: 5, Learning Rate: 3e-05
Model has 125,681,668 total parameters
Model has 1,034,498 trainable parameters
0.82% of the parameters are trainable
GPU memory allocated: 508.77 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.541,0.484581,0.8595,0.86139,0.8595,0.85909
2,0.2844,0.377795,0.8615,0.863891,0.8615,0.861482
3,0.3069,0.309983,0.879,0.879258,0.879,0.878903
4,0.3986,0.323433,0.8825,0.883524,0.8825,0.88229
5,0.2078,0.320414,0.8825,0.884163,0.8825,0.882209


Training time: 225.44 seconds

Running experiment with: Batch Size: 8, Epochs: 5, Learning Rate: 0.0001
Model has 125,681,668 total parameters
Model has 1,034,498 trainable parameters
0.82% of the parameters are trainable
GPU memory allocated: 508.77 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.237,0.292744,0.8885,0.890105,0.8885,0.888235
2,0.2342,0.324362,0.8955,0.895791,0.8955,0.895527
3,0.1927,0.304498,0.8935,0.897061,0.8935,0.893057
4,0.3266,0.304133,0.898,0.897992,0.898,0.897993
5,0.1158,0.307258,0.8995,0.900699,0.8995,0.899311


Training time: 225.48 seconds

Running experiment with: Batch Size: 16, Epochs: 3, Learning Rate: 3e-05
Model has 125,681,668 total parameters
Model has 1,034,498 trainable parameters
0.82% of the parameters are trainable
GPU memory allocated: 508.77 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6306,0.60654,0.855,0.858986,0.855,0.854263
2,0.2922,0.360492,0.861,0.861907,0.861,0.861037
3,0.2912,0.340752,0.8765,0.876816,0.8765,0.876388


Training time: 134.58 seconds

Running experiment with: Batch Size: 16, Epochs: 3, Learning Rate: 0.0001
Model has 125,681,668 total parameters
Model has 1,034,498 trainable parameters
0.82% of the parameters are trainable
GPU memory allocated: 508.77 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2324,0.308161,0.89,0.891022,0.89,0.889808
2,0.2354,0.315274,0.8925,0.892545,0.8925,0.892511
3,0.2264,0.302369,0.893,0.893669,0.893,0.892859


Training time: 134.21 seconds

Running experiment with: Batch Size: 16, Epochs: 5, Learning Rate: 3e-05
Model has 125,681,668 total parameters
Model has 1,034,498 trainable parameters
0.82% of the parameters are trainable
GPU memory allocated: 508.77 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.541,0.484581,0.8595,0.86139,0.8595,0.85909
2,0.2844,0.377795,0.8615,0.863891,0.8615,0.861482
3,0.3069,0.309983,0.879,0.879258,0.879,0.878903
4,0.3986,0.323433,0.8825,0.883524,0.8825,0.88229
5,0.2078,0.320414,0.8825,0.884163,0.8825,0.882209


Training time: 217.67 seconds

Running experiment with: Batch Size: 16, Epochs: 5, Learning Rate: 0.0001
Model has 125,681,668 total parameters
Model has 1,034,498 trainable parameters
0.82% of the parameters are trainable
GPU memory allocated: 508.77 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.237,0.292744,0.8885,0.890105,0.8885,0.888235
2,0.2342,0.324362,0.8955,0.895791,0.8955,0.895527
3,0.1927,0.304498,0.8935,0.897061,0.8935,0.893057
4,0.3266,0.304133,0.898,0.897992,0.898,0.897993
5,0.1158,0.307257,0.8995,0.900699,0.8995,0.899311


Training time: 217.75 seconds
time: 23min 54s (started: 2025-01-05 23:33:08 +00:00)


In [12]:
# Testing evaluations saved
results_df_phase_1 = pd.DataFrame(results_phase_1)
results_df_phase_1.to_csv("6_FT_RoBERTa_Experiments_FixedLoRA.csv", index=False)

time: 5.18 ms (started: 2025-01-05 23:57:11 +00:00)


### **Experimentations for RoBERTa - Phase 2:** changing LoRA hyperparameters

In [8]:
model_checkpoint = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_checkpoint)
model = RobertaForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to(device)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Fixed parameters for batch size and epochs, etc
fixed_batch_size = 8
fixed_epochs = 5
fixed_learning_rate = 1e-4
training_dropout = 0.1

# LoRA parameter combinations
ranks = [8, 16]
target_matrices_list = [
    ["attention.self.query"],
    ["attention.self.query", "attention.self.key"],
    ["attention.self.query", "attention.self.key", "attention.self.value"]
]
lora_alpha = 16
lora_dropouts = [0.1, 0.2]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

time: 10.6 s (started: 2025-01-06 00:30:59 +00:00)


In [9]:
results_phase_2 = []

for rank in ranks:
    for target_matrices in target_matrices_list:
        for lora_dropout in lora_dropouts:
            lora_config = LoraConfig(
                r=rank,
                lora_alpha=lora_alpha,  # Fixed lora_alpha
                target_modules=target_matrices,
                lora_dropout=lora_dropout,
                task_type="SEQ_CLS"
            )

            model_with_lora = get_peft_model(model, lora_config)

            start_time = time.time()
            print(f"\nRunning experiment with: Rank: {rank}, Target Matrices: {target_matrices}, LoRA Dropout: {lora_dropout}")

            num_parameters = sum(p.numel() for p in model_with_lora.parameters())
            trainable_parameters = sum(p.numel() for p in model_with_lora.parameters() if p.requires_grad)
            trainable_percentage = (trainable_parameters / num_parameters) * 100
            
            print(f"Model has {num_parameters:,} total parameters")
            print(f"Model has {trainable_parameters:,} trainable parameters")
            print(f"{trainable_percentage:.2f}% of the parameters are trainable")

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                gpu_memory = torch.cuda.memory_allocated() / 1024**2  # in MB
                print(f"GPU memory allocated: {gpu_memory:.2f} MB")

            wandb.config.update({"model/num_parameters": model.num_parameters()}, allow_val_change=True)

            output_dir = f"./results_phase2_r{rank}_alpha{lora_alpha}_drop{lora_dropout}_targets{'_'.join(target_matrices)}_bs{fixed_batch_size}_epochs{fixed_epochs}_lr{fixed_learning_rate}"
            training_args = TrainingArguments(
                output_dir=output_dir,
                evaluation_strategy="epoch",
                learning_rate=fixed_learning_rate,
                per_device_train_batch_size=fixed_batch_size,
                per_device_eval_batch_size=fixed_batch_size,
                num_train_epochs=fixed_epochs,
                weight_decay=0.01,
                save_total_limit=1,
                save_strategy="epoch",
                logging_dir="./logs",
                logging_steps=10,
                load_best_model_at_end=True
            )

            trainer = Trainer(
                model=model_with_lora,
                args=training_args,
                train_dataset=tokenized_train,
                eval_dataset=tokenized_test,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics
            )

            trainer.train()
            metrics = trainer.evaluate()

            end_time = time.time()
            elapsed_time = end_time - start_time
            print(f"Training time: {elapsed_time:.2f} seconds")

            results_phase_2.append({
                "Model": "RoBERTa",
                "Batch Size": fixed_batch_size,
                "Epochs": fixed_epochs,
                "Learning Rate": fixed_learning_rate,
                "Rank": rank,
                "Alpha": lora_alpha,  # Fixed alpha
                "LoRA Dropout": lora_dropout,
                "Target Matrices": target_matrices,
                "Accuracy": metrics["eval_accuracy"],
                "Precision": metrics["eval_precision"],
                "Recall": metrics["eval_recall"],
                "F1-Score": metrics["eval_f1"]
            })


Running experiment with: Rank: 8, Target Matrices: ['attention.self.query'], LoRA Dropout: 0.1
Model has 125,386,756 total parameters
Model has 739,586 trainable parameters
0.59% of the parameters are trainable
GPU memory allocated: 479.56 MB




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3233,0.306576,0.8755,0.875508,0.8755,0.875467
2,0.2693,0.312752,0.8855,0.885496,0.8855,0.88548
3,0.2536,0.303317,0.89,0.890477,0.89,0.889881
4,0.479,0.313724,0.8895,0.890188,0.8895,0.88935
5,0.2345,0.314326,0.891,0.891947,0.891,0.89082


Training time: 202.17 seconds

Running experiment with: Rank: 8, Target Matrices: ['attention.self.query'], LoRA Dropout: 0.2
Model has 125,386,756 total parameters
Model has 739,586 trainable parameters
0.59% of the parameters are trainable
GPU memory allocated: 504.27 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3299,0.309357,0.871,0.87101,0.871,0.871004
2,0.2708,0.31779,0.882,0.88201,0.882,0.882004
3,0.2528,0.302683,0.8895,0.88965,0.8895,0.889436
4,0.4813,0.315852,0.8895,0.889802,0.8895,0.889408
5,0.2312,0.316203,0.889,0.889654,0.889,0.888854


Training time: 201.62 seconds

Running experiment with: Rank: 8, Target Matrices: ['attention.self.query', 'attention.self.key'], LoRA Dropout: 0.1
Model has 125,534,212 total parameters
Model has 887,042 trainable parameters
0.71% of the parameters are trainable
GPU memory allocated: 504.83 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2146,0.324363,0.874,0.874064,0.874,0.873944
2,0.3303,0.327672,0.8855,0.885565,0.8855,0.885453
3,0.2646,0.308685,0.8895,0.889896,0.8895,0.889392
4,0.4009,0.312831,0.89,0.890059,0.89,0.889957
5,0.2346,0.316788,0.8895,0.890124,0.8895,0.889359


Training time: 212.01 seconds

Running experiment with: Rank: 8, Target Matrices: ['attention.self.query', 'attention.self.key'], LoRA Dropout: 0.2
Model has 125,534,212 total parameters
Model has 887,042 trainable parameters
0.71% of the parameters are trainable
GPU memory allocated: 506.52 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2187,0.332971,0.8725,0.872497,0.8725,0.872472
2,0.3436,0.332609,0.882,0.882052,0.882,0.881954
3,0.2642,0.315015,0.8895,0.890124,0.8895,0.889359
4,0.3971,0.314103,0.8885,0.888592,0.8885,0.888448
5,0.2404,0.319513,0.89,0.890724,0.89,0.889846


Training time: 212.13 seconds

Running experiment with: Rank: 8, Target Matrices: ['attention.self.query', 'attention.self.key', 'attention.self.value'], LoRA Dropout: 0.1
Model has 125,681,668 total parameters
Model has 1,034,498 trainable parameters
0.82% of the parameters are trainable
GPU memory allocated: 507.08 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2309,0.312235,0.8865,0.888753,0.8865,0.886156
2,0.248,0.315293,0.8945,0.895048,0.8945,0.89453
3,0.2096,0.307568,0.89,0.89423,0.89,0.889473
4,0.3264,0.308163,0.903,0.902995,0.903,0.902989
5,0.1557,0.314265,0.8995,0.900883,0.8995,0.899292


Training time: 222.66 seconds

Running experiment with: Rank: 8, Target Matrices: ['attention.self.query', 'attention.self.key', 'attention.self.value'], LoRA Dropout: 0.2
Model has 125,681,668 total parameters
Model has 1,034,498 trainable parameters
0.82% of the parameters are trainable
GPU memory allocated: 508.77 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2487,0.300775,0.886,0.88744,0.886,0.885747
2,0.2402,0.318364,0.8925,0.893109,0.8925,0.892531
3,0.2094,0.314554,0.8885,0.89296,0.8885,0.887942
4,0.3341,0.31222,0.902,0.901993,0.902,0.901993
5,0.1541,0.309949,0.8985,0.899607,0.8985,0.898318


Training time: 223.18 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.self.query'], LoRA Dropout: 0.1
Model has 125,829,124 total parameters
Model has 1,181,954 trainable parameters
0.94% of the parameters are trainable
GPU memory allocated: 508.21 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1795,0.338044,0.8915,0.893231,0.8915,0.891231
2,0.257,0.328147,0.8955,0.895642,0.8955,0.895521
3,0.2144,0.327865,0.892,0.897144,0.892,0.891402
4,0.3117,0.318461,0.9015,0.901504,0.9015,0.901502
5,0.176,0.32362,0.8985,0.899694,0.8985,0.898309


Training time: 223.07 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.self.query'], LoRA Dropout: 0.2
Model has 125,829,124 total parameters
Model has 1,181,954 trainable parameters
0.94% of the parameters are trainable
GPU memory allocated: 509.90 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1281,0.451681,0.8785,0.88751,0.8785,0.877413
2,0.2344,0.412281,0.896,0.896045,0.896,0.895965
3,0.1579,0.39757,0.89,0.894925,0.89,0.889408
4,0.2929,0.369199,0.902,0.901994,0.902,0.901989
5,0.1395,0.371487,0.903,0.903775,0.903,0.902865


Training time: 227.74 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.self.query', 'attention.self.key'], LoRA Dropout: 0.1
Model has 125,976,580 total parameters
Model has 1,329,410 trainable parameters
1.06% of the parameters are trainable
GPU memory allocated: 511.02 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1392,0.49634,0.87,0.882881,0.87,0.86841
2,0.1375,0.435731,0.8965,0.896555,0.8965,0.896463
3,0.157,0.441042,0.8885,0.894439,0.8885,0.887803
4,0.2716,0.396598,0.898,0.897995,0.898,0.897996
5,0.1774,0.413664,0.9005,0.901534,0.9005,0.900331


Training time: 227.72 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.self.query', 'attention.self.key'], LoRA Dropout: 0.2
Model has 125,976,580 total parameters
Model has 1,329,410 trainable parameters
1.06% of the parameters are trainable
GPU memory allocated: 512.71 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1383,0.595082,0.869,0.883046,0.869,0.867275
2,0.1127,0.496073,0.897,0.897253,0.897,0.896924
3,0.1387,0.458587,0.894,0.898815,0.894,0.893446
4,0.318,0.455016,0.8975,0.897495,0.8975,0.897487
5,0.1703,0.45746,0.8975,0.898602,0.8975,0.897317


Training time: 223.10 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.self.query', 'attention.self.key', 'attention.self.value'], LoRA Dropout: 0.1
Model has 126,124,036 total parameters
Model has 1,476,866 trainable parameters
1.17% of the parameters are trainable
GPU memory allocated: 513.83 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2175,0.320017,0.883,0.887625,0.883,0.882388
2,0.2357,0.30833,0.8935,0.893885,0.8935,0.893529
3,0.2047,0.310269,0.892,0.896604,0.892,0.891451
4,0.3184,0.298299,0.9,0.900069,0.9,0.899961
5,0.1452,0.301902,0.897,0.898416,0.897,0.896782


Training time: 223.38 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.self.query', 'attention.self.key', 'attention.self.value'], LoRA Dropout: 0.2
Model has 126,124,036 total parameters
Model has 1,476,866 trainable parameters
1.17% of the parameters are trainable
GPU memory allocated: 515.52 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.24,0.295665,0.886,0.887537,0.886,0.885735
2,0.2334,0.309105,0.893,0.893519,0.893,0.893031
3,0.2003,0.309078,0.8905,0.895166,0.8905,0.889935
4,0.3164,0.3018,0.903,0.903007,0.903,0.902981
5,0.1478,0.303627,0.8975,0.898778,0.8975,0.897298


Training time: 222.71 seconds
time: 43min 41s (started: 2025-01-06 00:31:16 +00:00)


In [10]:
# Testing evaluations saved
results_df_phase_2 = pd.DataFrame(results_phase_2)
results_df_phase_2.to_csv("6_FT_RoBERTa_Experiments_FixedTrainingHyp.csv", index=False)

time: 5.99 ms (started: 2025-01-06 01:19:28 +00:00)
