##### **Installing dependencies**

In [1]:
!pip install ipython-autotime gdown evaluate accelerate bitsandbytes peft loralib huggingface_hub transformers peft

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting loralib
  Downloading loralib-0.1.2-py3-none-any.whl.metadata (15 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.27.0-py3-none-any.whl.metadata (13 kB)
Downloading ipython_autotime-0.3.2-py2.py3-none-any.whl (7.0 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m

##### **Importing dependencies**

In [2]:
%load_ext autotime
import pandas as pd
import numpy as np
import nltk
import os
import zipfile
import tarfile
import re
import gdown
import gzip
import shutil
import wandb
import time
import torch
import psutil

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, precision_recall_fscore_support
from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    RobertaTokenizerFast, 
    RobertaForSequenceClassification,
    GPT2TokenizerFast, 
    GPT2ForSequenceClassification,
    AlbertTokenizer, 
    AlbertForSequenceClassification,
    GenerationConfig,
    TrainingArguments,
    Trainer,
    pipeline,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    DataCollatorWithPadding,
    AdamW,
    get_scheduler
)
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import time
import evaluate
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel,
    PeftConfig,
)
from huggingface_hub import login
import kagglehub

# from nltk.corpus import stopwords
# from nltk import word_tokenize
# from nltk.stem import WordNetLemmatizer
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from google.colab import files
# from scipy.sparse import hstack
# from gensim.models import Word2Vec

import warnings

# Suppress specific warnings
warnings.filterwarnings("ignore", message=".*clean_up_tokenization_spaces.*")
warnings.filterwarnings("ignore", message="Some weights of DistilBertForSequenceClassification were not initialized.*")
warnings.filterwarnings("ignore", message="Some weights of AlbertForSequenceClassification were not initialized.*")
warnings.filterwarnings("ignore", message=".*evaluation_strategy.*")
warnings.filterwarnings("ignore", message=".*gather along dimension 0.*")
warnings.filterwarnings("ignore", message=".*GradScaler.*")

time: 32.3 s (started: 2025-01-06 11:07:38 +00:00)


In [3]:
# Disable wandb Logging
os.environ["WANDB_MODE"] = "disabled"
wandb.init()

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda
time: 5.81 s (started: 2025-01-06 11:08:14 +00:00)


##### **Supporting functions**

In [4]:
def clean_review(review):
    review = re.sub(r'<.*?>', '', review)
    review = re.sub(r'http\S+|www\S+|https\S+', '', review, flags=re.MULTILINE)
    review = review.strip()
    return review

def preprocess_function(examples):
    inputs = tokenizer(examples["review"], truncation=True, padding=True, max_length=512)
    inputs["labels"] = [1 if label.lower() == "positive" else 0 for label in examples["sentiment"]]
    return inputs

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

time: 840 µs (started: 2025-01-06 11:08:21 +00:00)


##### **Loading data**

In [5]:
train_df_full = pd.read_csv("/kaggle/input/imdb-dataset-3/train.csv")
train_df = train_df_full.sample(n=3000, random_state=42)
train_df['review'] = train_df['review'].apply(clean_review)
train_df.reset_index(drop=True, inplace=True)

time: 1.03 s (started: 2025-01-06 11:08:28 +00:00)


In [6]:
test_df_full = pd.read_csv("/kaggle/input/imdb-dataset-3/test.csv")
test_df = test_df_full.sample(n=2000, random_state=42)
test_df['review'] = test_df['review'].apply(clean_review)
test_df.reset_index(drop=True, inplace=True)

time: 654 ms (started: 2025-01-06 11:08:29 +00:00)


In [7]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

time: 83.4 ms (started: 2025-01-06 11:08:29 +00:00)


### **Experimentations for ALBERT - Phase 1:** keeping LoRA hyperparams fixed

In [8]:
model_checkpoint = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(model_checkpoint)
model = AlbertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to(device)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Fixed LoRA parameters
rank = 8 
target_matrices = ["attention.query", "attention.key", "attention.value"]
lora_alpha = 16
lora_dropout = 0.1

# Changing hyperparams for batch size, epochs and learning rates
batch_sizes = [4, 8]
epochs_list = [3, 5]
learning_rates = [3e-5, 1e-4]

training_dropout = 0.1 # Fixed

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

time: 12.9 s (started: 2025-01-06 03:18:03 +00:00)


In [9]:
print(f"Model is running on device: {model.device}")

Model is running on device: cuda:0
time: 519 µs (started: 2025-01-06 03:18:23 +00:00)


In [10]:
results_phase_1 = []

for batch_size in batch_sizes:
    for epochs in epochs_list:
        for learning_rate in learning_rates:
            lora_config = LoraConfig(
                r=rank,
                lora_alpha=lora_alpha,
                target_modules=target_matrices,
                lora_dropout=lora_dropout,
                task_type="SEQ_CLS"
            )

            model_with_lora = get_peft_model(model, lora_config)
            
            start_time = time.time()
            print(f"\nRunning experiment with: Batch Size: {batch_size}, Epochs: {epochs}, Learning Rate: {learning_rate}")

            num_parameters = sum(p.numel() for p in model_with_lora.parameters())
            trainable_parameters = sum(p.numel() for p in model_with_lora.parameters() if p.requires_grad)
            trainable_percentage = (trainable_parameters / num_parameters) * 100
            
            print(f"Model has {num_parameters:,} total parameters")
            print(f"Model has {trainable_parameters:,} trainable parameters")
            print(f"{trainable_percentage:.2f}% of the parameters are trainable")

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                gpu_memory = torch.cuda.memory_allocated() / 1024**2  # in MB
                print(f"GPU memory allocated: {gpu_memory:.2f} MB")

            wandb.config.update({"model/num_parameters": model.num_parameters()}, allow_val_change=True)

            output_dir = f"./results_phase1_r{rank}_alpha{lora_alpha}_drop{lora_dropout}_targets{'_'.join(target_matrices)}_bs{batch_size}_epochs{epochs}_lr{learning_rate}"
            training_args = TrainingArguments(
                output_dir=output_dir,
                evaluation_strategy="epoch",
                learning_rate=learning_rate,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                num_train_epochs=epochs,
                weight_decay=0.01,
                save_total_limit=1,
                save_strategy="epoch",
                logging_dir="./logs",
                logging_steps=10,
                load_best_model_at_end=True,
            )

            trainer = Trainer(
                model=model_with_lora,
                args=training_args,
                train_dataset=tokenized_train,
                eval_dataset=tokenized_test,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics
            )

            trainer.train()
            metrics = trainer.evaluate()

            end_time = time.time()
            elapsed_time = end_time - start_time
            print(f"Training time: {elapsed_time:.2f} seconds")

            results_phase_1.append({
                "Model": "ALBERT",
                "Batch Size": batch_size,
                "Epochs": epochs,
                "Learning Rate": learning_rate,
                "Rank": rank,
                "Alpha": lora_alpha,
                "LoRA Dropout": lora_dropout,
                "Target Matrices": target_matrices,
                "Accuracy": metrics["eval_accuracy"],
                "Precision": metrics["eval_precision"],
                "Recall": metrics["eval_recall"],
                "F1-Score": metrics["eval_f1"]                
            })


Running experiment with: Batch Size: 4, Epochs: 3, Learning Rate: 3e-05
Model has 11,723,524 total parameters
Model has 38,402 trainable parameters
0.33% of the parameters are trainable
GPU memory allocated: 44.73 MB




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6686,0.670187,0.6035,0.624403,0.6035,0.577508
2,0.5068,0.531487,0.776,0.783831,0.776,0.775257
3,0.395,0.440732,0.8205,0.82189,0.8205,0.820528


Training time: 646.55 seconds

Running experiment with: Batch Size: 4, Epochs: 3, Learning Rate: 0.0001
Model has 11,723,524 total parameters
Model has 38,402 trainable parameters
0.33% of the parameters are trainable
GPU memory allocated: 61.42 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2452,0.305836,0.8825,0.883307,0.8825,0.882533
2,0.2157,0.293271,0.911,0.9112,0.911,0.910946
3,0.1396,0.2919,0.911,0.910997,0.911,0.91099


Training time: 663.68 seconds

Running experiment with: Batch Size: 4, Epochs: 5, Learning Rate: 3e-05
Model has 11,723,524 total parameters
Model has 38,402 trainable parameters
0.33% of the parameters are trainable
GPU memory allocated: 61.42 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.663,0.668339,0.607,0.640289,0.607,0.573069
2,0.3666,0.402713,0.841,0.846601,0.841,0.840752
3,0.1943,0.303722,0.8825,0.882679,0.8825,0.882525
4,0.3305,0.292062,0.892,0.892036,0.892,0.89201
5,0.2944,0.290203,0.896,0.896004,0.896,0.89598


Training time: 1068.76 seconds

Running experiment with: Batch Size: 4, Epochs: 5, Learning Rate: 0.0001
Model has 11,723,524 total parameters
Model has 38,402 trainable parameters
0.33% of the parameters are trainable
GPU memory allocated: 61.42 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3126,0.308228,0.88,0.882896,0.88,0.879966
2,0.2487,0.29361,0.9085,0.909243,0.9085,0.908526
3,0.1187,0.289587,0.917,0.917441,0.917,0.916922
4,0.4256,0.292842,0.9165,0.916636,0.9165,0.916517
5,0.1463,0.291703,0.918,0.918211,0.918,0.91795


Training time: 1069.37 seconds

Running experiment with: Batch Size: 8, Epochs: 3, Learning Rate: 3e-05
Model has 11,723,524 total parameters
Model has 38,402 trainable parameters
0.33% of the parameters are trainable
GPU memory allocated: 61.42 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7041,0.687831,0.5445,0.543094,0.5445,0.539779
2,0.6705,0.663635,0.625,0.624928,0.625,0.624958
3,0.6439,0.64922,0.6555,0.655452,0.6555,0.654646


Training time: 636.12 seconds

Running experiment with: Batch Size: 8, Epochs: 3, Learning Rate: 0.0001
Model has 11,723,524 total parameters
Model has 38,402 trainable parameters
0.33% of the parameters are trainable
GPU memory allocated: 61.42 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5539,0.446333,0.811,0.815649,0.811,0.810767
2,0.3241,0.285541,0.9,0.90099,0.9,0.899835
3,0.2101,0.267127,0.9,0.900149,0.9,0.899945


Training time: 635.88 seconds

Running experiment with: Batch Size: 8, Epochs: 5, Learning Rate: 3e-05
Model has 11,723,524 total parameters
Model has 38,402 trainable parameters
0.33% of the parameters are trainable
GPU memory allocated: 61.42 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.703,0.68641,0.549,0.548161,0.549,0.539804
2,0.6536,0.645013,0.6695,0.672564,0.6695,0.669179
3,0.5191,0.527987,0.776,0.779537,0.776,0.775821
4,0.3998,0.420881,0.833,0.833523,0.833,0.833048
5,0.4331,0.393097,0.8475,0.847506,0.8475,0.847503


Training time: 1025.85 seconds

Running experiment with: Batch Size: 8, Epochs: 5, Learning Rate: 0.0001
Model has 11,723,524 total parameters
Model has 38,402 trainable parameters
0.33% of the parameters are trainable
GPU memory allocated: 61.42 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5163,0.390825,0.84,0.842739,0.84,0.839954
2,0.3212,0.265708,0.9045,0.905465,0.9045,0.904525
3,0.2119,0.24557,0.911,0.9112,0.911,0.910946
4,0.1502,0.244345,0.9145,0.914637,0.9145,0.914517
5,0.2786,0.238338,0.919,0.919143,0.919,0.91896


Training time: 1026.58 seconds
time: 1h 52min 52s (started: 2025-01-06 03:18:28 +00:00)


In [11]:
# Testing evaluations saved
results_df_phase_1 = pd.DataFrame(results_phase_1)
results_df_phase_1.to_csv("7_FT_ALBERT_Experiments_FixedLoRA.csv", index=False)

time: 5.55 ms (started: 2025-01-06 05:12:32 +00:00)


### **Experimentations for ALBERT - Phase 2:** changing LoRA hyperparameters

In [8]:
model_checkpoint = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(model_checkpoint)
model = AlbertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to(device)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Fixed parameters for batch size and epochs, etc
fixed_batch_size = 16
fixed_epochs = 5
fixed_learning_rate = 1e-4
training_dropout = 0.1

# LoRA parameter combinations
ranks = [8, 16]
target_matrices_list = [["attention.query"], ["attention.query", "attention.key"], ["attention.query", "attention.key", "attention.value"]]
lora_alpha = 16
lora_dropouts = [0.1, 0.2]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

time: 14.6 s (started: 2025-01-06 11:08:49 +00:00)


In [9]:
results_phase_2 = []

for rank in ranks:
    for target_matrices in target_matrices_list:
        for lora_dropout in lora_dropouts:
            lora_config = LoraConfig(
                r=rank,
                lora_alpha=lora_alpha,  # Fixed lora_alpha
                target_modules=target_matrices,
                lora_dropout=lora_dropout,
                task_type="SEQ_CLS"
            )

            model_with_lora = get_peft_model(model, lora_config)

            start_time = time.time()
            print(f"\nRunning experiment with: Rank: {rank}, Target Matrices: {target_matrices}, LoRA Dropout: {lora_dropout}")

            num_parameters = sum(p.numel() for p in model_with_lora.parameters())
            trainable_parameters = sum(p.numel() for p in model_with_lora.parameters() if p.requires_grad)
            trainable_percentage = (trainable_parameters / num_parameters) * 100
            
            print(f"Model has {num_parameters:,} total parameters")
            print(f"Model has {trainable_parameters:,} trainable parameters")
            print(f"{trainable_percentage:.2f}% of the parameters are trainable")

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                gpu_memory = torch.cuda.memory_allocated() / 1024**2  # in MB
                print(f"GPU memory allocated: {gpu_memory:.2f} MB")

            wandb.config.update({"model/num_parameters": model.num_parameters()}, allow_val_change=True)

            output_dir = f"./results_phase2_r{rank}_alpha{lora_alpha}_drop{lora_dropout}_targets{'_'.join(target_matrices)}_bs{fixed_batch_size}_epochs{fixed_epochs}_lr{fixed_learning_rate}"
            training_args = TrainingArguments(
                output_dir=output_dir,
                evaluation_strategy="epoch",
                learning_rate=fixed_learning_rate,
                per_device_train_batch_size=fixed_batch_size,
                per_device_eval_batch_size=fixed_batch_size,
                num_train_epochs=fixed_epochs,
                weight_decay=0.01,
                save_total_limit=1,
                save_strategy="epoch",
                logging_dir="./logs",
                logging_steps=10,
                fp16=True,
                load_best_model_at_end=True,
            )

            trainer = Trainer(
                model=model_with_lora,
                args=training_args,
                train_dataset=tokenized_train,
                eval_dataset=tokenized_test,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics
            )

            trainer.train()
            metrics = trainer.evaluate()

            end_time = time.time()
            elapsed_time = end_time - start_time
            print(f"Training time: {elapsed_time:.2f} seconds")

            results_phase_2.append({
                "Model": "ALBERT",
                "Batch Size": fixed_batch_size,
                "Epochs": fixed_epochs,
                "Learning Rate": fixed_learning_rate,
                "Rank": rank,
                "Alpha": lora_alpha,  # Fixed alpha
                "LoRA Dropout": lora_dropout,
                "Target Matrices": target_matrices,
                "Accuracy": metrics["eval_accuracy"],
                "Precision": metrics["eval_precision"],
                "Recall": metrics["eval_recall"],
                "F1-Score": metrics["eval_f1"]
            })


Running experiment with: Rank: 8, Target Matrices: ['attention.query'], LoRA Dropout: 0.1
Model has 11,698,948 total parameters
Model has 13,826 trainable parameters
0.12% of the parameters are trainable
GPU memory allocated: 44.64 MB




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6924,0.674462,0.5835,0.621132,0.5835,0.535353
2,0.6204,0.607277,0.687,0.691465,0.687,0.686398
3,0.4966,0.508597,0.76,0.7652,0.76,0.759561
4,0.4843,0.442865,0.8075,0.80775,0.8075,0.807284
5,0.4155,0.423513,0.815,0.815,0.815,0.814918


Training time: 898.23 seconds

Running experiment with: Rank: 8, Target Matrices: ['attention.query'], LoRA Dropout: 0.2
Model has 11,698,948 total parameters
Model has 13,826 trainable parameters
0.12% of the parameters are trainable
GPU memory allocated: 61.67 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6932,0.67577,0.58,0.614674,0.58,0.531958
2,0.6355,0.625485,0.672,0.679291,0.672,0.670483
3,0.5171,0.531594,0.744,0.748592,0.744,0.7436
4,0.4982,0.458264,0.7985,0.799679,0.7985,0.797957
5,0.4197,0.433823,0.8175,0.817781,0.8175,0.817295


Training time: 899.08 seconds

Running experiment with: Rank: 8, Target Matrices: ['attention.query', 'attention.key'], LoRA Dropout: 0.1
Model has 11,711,236 total parameters
Model has 26,114 trainable parameters
0.22% of the parameters are trainable
GPU memory allocated: 61.72 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6582,0.655283,0.625,0.640255,0.625,0.608707
2,0.517,0.513102,0.7585,0.763171,0.7585,0.758133
3,0.3572,0.363552,0.8475,0.847828,0.8475,0.84734
4,0.3365,0.319395,0.8715,0.871497,0.8715,0.871472
5,0.3207,0.309292,0.8745,0.874729,0.8745,0.874403


Training time: 920.27 seconds

Running experiment with: Rank: 8, Target Matrices: ['attention.query', 'attention.key'], LoRA Dropout: 0.2
Model has 11,711,236 total parameters
Model has 26,114 trainable parameters
0.22% of the parameters are trainable
GPU memory allocated: 61.86 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6615,0.659641,0.618,0.634161,0.618,0.599636
2,0.5539,0.542307,0.741,0.745163,0.741,0.74066
3,0.392,0.395847,0.827,0.826976,0.827,0.826981
4,0.362,0.339497,0.8595,0.859486,0.8595,0.859476
5,0.3313,0.326023,0.869,0.869467,0.869,0.868848


Training time: 919.86 seconds

Running experiment with: Rank: 8, Target Matrices: ['attention.query', 'attention.key', 'attention.value'], LoRA Dropout: 0.1
Model has 11,723,524 total parameters
Model has 38,402 trainable parameters
0.33% of the parameters are trainable
GPU memory allocated: 61.91 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5765,0.551631,0.737,0.741676,0.737,0.734622
2,0.2809,0.315389,0.8705,0.872249,0.8705,0.87051
3,0.2499,0.271304,0.8885,0.888497,0.8885,0.888498
4,0.2583,0.261163,0.8995,0.899493,0.8995,0.899495
5,0.2682,0.257992,0.9015,0.901527,0.9015,0.901508


Training time: 939.10 seconds

Running experiment with: Rank: 8, Target Matrices: ['attention.query', 'attention.key', 'attention.value'], LoRA Dropout: 0.2
Model has 11,723,524 total parameters
Model has 38,402 trainable parameters
0.33% of the parameters are trainable
GPU memory allocated: 62.05 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5854,0.563911,0.7265,0.732359,0.7265,0.72343
2,0.2955,0.322439,0.869,0.870501,0.869,0.869019
3,0.257,0.27525,0.8865,0.886515,0.8865,0.886506
4,0.2668,0.263273,0.8985,0.898497,0.8985,0.898498
5,0.2776,0.260102,0.9005,0.900497,0.9005,0.900498


Training time: 938.76 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.query'], LoRA Dropout: 0.1
Model has 11,735,812 total parameters
Model has 50,690 trainable parameters
0.43% of the parameters are trainable
GPU memory allocated: 62.00 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2603,0.249009,0.901,0.901,0.901,0.901
2,0.1732,0.24926,0.9075,0.907882,0.9075,0.907526
3,0.1756,0.245058,0.907,0.907128,0.907,0.906954
4,0.2167,0.241744,0.912,0.912002,0.912,0.911987
5,0.2463,0.240815,0.9115,0.911548,0.9115,0.911472


Training time: 939.35 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.query'], LoRA Dropout: 0.2
Model has 11,735,812 total parameters
Model has 50,690 trainable parameters
0.43% of the parameters are trainable
GPU memory allocated: 62.14 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.214,0.242858,0.912,0.912779,0.912,0.912025
2,0.144,0.250011,0.9135,0.914109,0.9135,0.913525
3,0.1497,0.244209,0.9125,0.912531,0.9125,0.912477
4,0.1892,0.242756,0.913,0.913003,0.913,0.912987
5,0.2273,0.245166,0.913,0.913106,0.913,0.912962


Training time: 940.65 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.query', 'attention.key'], LoRA Dropout: 0.1
Model has 11,748,100 total parameters
Model has 62,978 trainable parameters
0.54% of the parameters are trainable
GPU memory allocated: 62.23 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2408,0.240759,0.9105,0.910583,0.9105,0.910514
2,0.1659,0.254935,0.9005,0.902198,0.9005,0.900511
3,0.1671,0.242638,0.9125,0.912517,0.9125,0.912481
4,0.1971,0.241928,0.91,0.91001,0.91,0.909982
5,0.2066,0.242431,0.91,0.910163,0.91,0.90995


Training time: 941.02 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.query', 'attention.key'], LoRA Dropout: 0.2
Model has 11,748,100 total parameters
Model has 62,978 trainable parameters
0.54% of the parameters are trainable
GPU memory allocated: 62.38 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2358,0.24006,0.908,0.908518,0.908,0.908027
2,0.1664,0.24531,0.911,0.911708,0.911,0.911025
3,0.1614,0.241163,0.914,0.91404,0.914,0.913975
4,0.1908,0.240904,0.91,0.910022,0.91,0.909978
5,0.2035,0.241561,0.9125,0.91262,0.9125,0.912459


Training time: 940.74 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.query', 'attention.key', 'attention.value'], LoRA Dropout: 0.1
Model has 11,760,388 total parameters
Model has 75,266 trainable parameters
0.64% of the parameters are trainable
GPU memory allocated: 62.47 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5839,0.560658,0.729,0.73398,0.729,0.726332
2,0.3011,0.32168,0.867,0.868315,0.867,0.867025
3,0.2448,0.274331,0.894,0.894043,0.894,0.893964
4,0.2743,0.264743,0.8955,0.895521,0.8955,0.895472
5,0.2546,0.261431,0.8965,0.896521,0.8965,0.896473


Training time: 940.95 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.query', 'attention.key', 'attention.value'], LoRA Dropout: 0.2
Model has 11,760,388 total parameters
Model has 75,266 trainable parameters
0.64% of the parameters are trainable
GPU memory allocated: 62.61 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5941,0.573172,0.712,0.717945,0.712,0.708477
2,0.3244,0.329228,0.868,0.868702,0.868,0.868038
3,0.2555,0.280044,0.89,0.890081,0.89,0.889952
4,0.2823,0.269371,0.8955,0.895536,0.8955,0.895467
5,0.2595,0.26525,0.8965,0.8965,0.8965,0.896482


Training time: 940.79 seconds
time: 3h 5min 59s (started: 2025-01-06 11:09:23 +00:00)


In [10]:
# Testing evaluations saved
results_df_phase_2 = pd.DataFrame(results_phase_2)
results_df_phase_2.to_csv("7_FT_ALBERT_Experiments_FixedTrainingHyp.csv", index=False)

time: 12.3 ms (started: 2025-01-06 14:17:28 +00:00)
