##### **Installing dependencies**

In [None]:
!pip install ipython-autotime gdown evaluate accelerate bitsandbytes peft loralib huggingface_hub transformers peft

##### **Importing dependencies**

In [None]:
%load_ext autotime
import pandas as pd
import numpy as np
import nltk
import os
import zipfile
import tarfile
import re
import gdown
import gzip
import shutil
import wandb
import time
import torch
import psutil

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, precision_recall_fscore_support
from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    RobertaTokenizerFast, 
    RobertaForSequenceClassification,
    GPT2TokenizerFast, 
    GPT2ForSequenceClassification,
    AlbertTokenizer, 
    AlbertForSequenceClassification,
    GenerationConfig,
    TrainingArguments,
    Trainer,
    pipeline,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    DataCollatorWithPadding,
    AdamW,
    get_scheduler
)
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import time
import evaluate
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel,
    PeftConfig,
)
from huggingface_hub import login
import kagglehub

# from nltk.corpus import stopwords
# from nltk import word_tokenize
# from nltk.stem import WordNetLemmatizer
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from google.colab import files
# from scipy.sparse import hstack
# from gensim.models import Word2Vec

import warnings

# Suppress specific warnings
warnings.filterwarnings("ignore", message=".*clean_up_tokenization_spaces.*")
warnings.filterwarnings("ignore", message="Some weights of DistilBertForSequenceClassification were not initialized.*")
warnings.filterwarnings("ignore", message="Some weights of AlbertForSequenceClassification were not initialized.*")
warnings.filterwarnings("ignore", message=".*evaluation_strategy.*")
warnings.filterwarnings("ignore", message=".*gather along dimension 0.*")

In [3]:
# Disable wandb Logging
os.environ["WANDB_MODE"] = "disabled"
wandb.init()

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda
time: 5.9 s (started: 2025-01-06 02:19:58 +00:00)


##### **Supporting functions**

In [4]:
def clean_review(review):
    review = re.sub(r'<.*?>', '', review)
    review = re.sub(r'http\S+|www\S+|https\S+', '', review, flags=re.MULTILINE)
    review = review.strip()
    return review

def preprocess_function(examples):
    inputs = tokenizer(examples["review"], truncation=True, padding=True, max_length=512)
    inputs["labels"] = [1 if label.lower() == "positive" else 0 for label in examples["sentiment"]]
    return inputs

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

time: 769 µs (started: 2025-01-06 02:20:06 +00:00)


##### **Loading data**

In [5]:
train_df_full = pd.read_csv("/kaggle/input/imdb-dataset-3/train.csv")
train_df = train_df_full.sample(n=3000, random_state=42)
train_df['review'] = train_df['review'].apply(clean_review)
train_df.reset_index(drop=True, inplace=True)

time: 963 ms (started: 2025-01-06 02:20:11 +00:00)


In [6]:
test_df_full = pd.read_csv("/kaggle/input/imdb-dataset-3/test.csv")
test_df = test_df_full.sample(n=2000, random_state=42)
test_df['review'] = test_df['review'].apply(clean_review)
test_df.reset_index(drop=True, inplace=True)

time: 643 ms (started: 2025-01-06 02:20:14 +00:00)


In [7]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

time: 70.2 ms (started: 2025-01-06 02:20:18 +00:00)


### **Experimentations for ALBERT - Phase 1:** keeping LoRA hyperparams fixed

In [8]:
model_checkpoint = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(model_checkpoint)
model = AlbertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to(device)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Fixed LoRA parameters
rank = 8 
target_matrices = ["attention.query", "attention.key", "attention.value"]
lora_alpha = 16
lora_dropout = 0.1

# Changing hyperparams for batch size, epochs and learning rates
batch_sizes = [4, 8]
epochs_list = [3, 5]
learning_rates = [3e-5, 1e-4]

training_dropout = 0.1 # Fixed

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

time: 13.8 s (started: 2025-01-06 02:20:26 +00:00)


In [9]:
print(f"Model is running on device: {model.device}")

Model is running on device: cuda:0
time: 518 µs (started: 2025-01-06 02:20:43 +00:00)


In [10]:
results_phase_1 = []

for batch_size in batch_sizes:
    for epochs in epochs_list:
        for learning_rate in learning_rates:
            lora_config = LoraConfig(
                r=rank,
                lora_alpha=lora_alpha,
                target_modules=target_matrices,
                lora_dropout=lora_dropout,
                task_type="SEQ_CLS"
            )

            model_with_lora = get_peft_model(model, lora_config)
            
            start_time = time.time()
            print(f"\nRunning experiment with: Batch Size: {batch_size}, Epochs: {epochs}, Learning Rate: {learning_rate}")

            num_parameters = sum(p.numel() for p in model_with_lora.parameters())
            trainable_parameters = sum(p.numel() for p in model_with_lora.parameters() if p.requires_grad)
            trainable_percentage = (trainable_parameters / num_parameters) * 100
            
            print(f"Model has {num_parameters:,} total parameters")
            print(f"Model has {trainable_parameters:,} trainable parameters")
            print(f"{trainable_percentage:.2f}% of the parameters are trainable")

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                gpu_memory = torch.cuda.memory_allocated() / 1024**2  # in MB
                print(f"GPU memory allocated: {gpu_memory:.2f} MB")

            wandb.config.update({"model/num_parameters": model.num_parameters()}, allow_val_change=True)

            output_dir = f"./results_phase1_r{rank}_alpha{lora_alpha}_drop{lora_dropout}_targets{'_'.join(target_matrices)}_bs{batch_size}_epochs{epochs}_lr{learning_rate}"
            training_args = TrainingArguments(
                output_dir=output_dir,
                evaluation_strategy="epoch",
                learning_rate=learning_rate,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                num_train_epochs=epochs,
                weight_decay=0.01,
                save_total_limit=1,
                save_strategy="epoch",
                logging_dir="./logs",
                logging_steps=10,
                load_best_model_at_end=True,
            )

            trainer = Trainer(
                model=model_with_lora,
                args=training_args,
                train_dataset=tokenized_train,
                eval_dataset=tokenized_test,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics
            )

            trainer.train()
            metrics = trainer.evaluate()

            end_time = time.time()
            elapsed_time = end_time - start_time
            print(f"Training time: {elapsed_time:.2f} seconds")

            results_phase_1.append({
                "Model": "ALBERT",
                "Batch Size": batch_size,
                "Epochs": epochs,
                "Learning Rate": learning_rate,
                "Rank": rank,
                "Alpha": lora_alpha,
                "LoRA Dropout": lora_dropout,
                "Target Matrices": target_matrices,
                "Accuracy": metrics["eval_accuracy"],
                "Precision": metrics["eval_precision"],
                "Recall": metrics["eval_recall"],
                "F1-Score": metrics["eval_f1"]                
            })


Running experiment with: Batch Size: 16, Epochs: 3, Learning Rate: 3e-05
Model has 11,723,524 total parameters
Model has 38,402 trainable parameters
0.33% of the parameters are trainable
GPU memory allocated: 44.73 MB




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6824,0.690203,0.5225,0.522046,0.5225,0.522163
2,0.6936,0.67984,0.576,0.576184,0.576,0.576071
3,0.6847,0.674141,0.586,0.58675,0.586,0.580149


Training time: 616.59 seconds

Running experiment with: Batch Size: 16, Epochs: 3, Learning Rate: 0.0001
Model has 11,723,524 total parameters
Model has 38,402 trainable parameters
0.33% of the parameters are trainable
GPU memory allocated: 62.05 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6463,0.649045,0.6235,0.666646,0.6235,0.58958
2,0.5296,0.518256,0.8035,0.807859,0.8035,0.80328
3,0.4288,0.430748,0.842,0.842068,0.842,0.841913


Training time: 629.46 seconds

Running experiment with: Batch Size: 16, Epochs: 5, Learning Rate: 3e-05
Model has 11,723,524 total parameters
Model has 38,402 trainable parameters
0.33% of the parameters are trainable
GPU memory allocated: 62.05 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6821,0.688815,0.524,0.523218,0.524,0.523255
2,0.6864,0.674546,0.601,0.60749,0.601,0.598616
3,0.6621,0.657121,0.6605,0.669557,0.6605,0.658221
4,0.6491,0.641224,0.6835,0.684145,0.6835,0.682267
5,0.6383,0.636038,0.693,0.693585,0.693,0.69194


Training time: 1015.05 seconds

Running experiment with: Batch Size: 16, Epochs: 5, Learning Rate: 0.0001
Model has 11,723,524 total parameters
Model has 38,402 trainable parameters
0.33% of the parameters are trainable
GPU memory allocated: 62.05 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6381,0.633368,0.6725,0.707261,0.6725,0.653722
2,0.4308,0.410343,0.8515,0.851488,0.8515,0.851492
3,0.3122,0.32119,0.878,0.882043,0.878,0.877416
4,0.282,0.285743,0.893,0.89314,0.893,0.892941
5,0.3226,0.279729,0.8955,0.896281,0.8955,0.89535


Training time: 1015.84 seconds

Running experiment with: Batch Size: 32, Epochs: 3, Learning Rate: 3e-05
Model has 11,723,524 total parameters
Model has 38,402 trainable parameters
0.33% of the parameters are trainable
GPU memory allocated: 62.05 MB


OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/parallel/parallel_apply.py", line 84, in _worker
    output = module(*input, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/peft/peft_model.py", line 1521, in forward
    return self.base_model(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/peft/tuners/tuners_utils.py", line 197, in forward
    return self.model.forward(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/albert/modeling_albert.py", line 1059, in forward
    outputs = self.albert(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/albert/modeling_albert.py", line 719, in forward
    encoder_outputs = self.encoder(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/albert/modeling_albert.py", line 468, in forward
    layer_group_output = self.albert_layer_groups[group_idx](
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/albert/modeling_albert.py", line 420, in forward
    layer_output = albert_layer(hidden_states, attention_mask, head_mask[layer_index], output_attentions)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/albert/modeling_albert.py", line 383, in forward
    attention_output = self.attention(hidden_states, attention_mask, head_mask, output_attentions)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/albert/modeling_albert.py", line 318, in forward
    attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 384.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 164.12 MiB is free. Process 2475 has 14.58 GiB memory in use. Of the allocated memory 14.32 GiB is allocated by PyTorch, and 60.33 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


time: 54min 39s (started: 2025-01-06 02:20:44 +00:00)


In [15]:
# Testing evaluations saved
results_df_phase_1 = pd.DataFrame(results_phase_1)
results_df_phase_1.to_csv("7_FT_ALBERT_Experiments_FixedLoRA.csv", index=False)

time: 5.66 ms (started: 2025-01-05 20:13:30 +00:00)


### **Experimentations for ALBERT - Phase 2:** changing LoRA hyperparameters

In [10]:
model_checkpoint = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(model_checkpoint)
model = AlbertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to(device)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Fixed parameters for batch size and epochs, etc
fixed_batch_size = 16
fixed_epochs = 5
fixed_learning_rate = 1e-4
training_dropout = 0.1

# LoRA parameter combinations
ranks = [8, 16]
target_matrices_list = [["attention.query"], ["attention.query", "attention.key"], ["attention.query", "attention.key", "attention.value"]]
lora_alpha = 16
lora_dropouts = [0.1, 0.2]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

time: 5.69 s (started: 2025-01-05 20:30:07 +00:00)


In [11]:
results_phase_2 = []

for rank in ranks:
    for target_matrices in target_matrices_list:
        for lora_dropout in lora_dropouts:
            lora_config = LoraConfig(
                r=rank,
                lora_alpha=lora_alpha,  # Fixed lora_alpha
                target_modules=target_matrices,
                lora_dropout=lora_dropout,
                task_type="SEQ_CLS"
            )

            model_with_lora = get_peft_model(model, lora_config)

            start_time = time.time()
            print(f"\nRunning experiment with: Rank: {rank}, Target Matrices: {target_matrices}, LoRA Dropout: {lora_dropout}")

            num_parameters = sum(p.numel() for p in model_with_lora.parameters())
            trainable_parameters = sum(p.numel() for p in model_with_lora.parameters() if p.requires_grad)
            trainable_percentage = (trainable_parameters / num_parameters) * 100
            
            print(f"Model has {num_parameters:,} total parameters")
            print(f"Model has {trainable_parameters:,} trainable parameters")
            print(f"{trainable_percentage:.2f}% of the parameters are trainable")

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                gpu_memory = torch.cuda.memory_allocated() / 1024**2  # in MB
                print(f"GPU memory allocated: {gpu_memory:.2f} MB")

            wandb.config.update({"model/num_parameters": model.num_parameters()}, allow_val_change=True)

            output_dir = f"./results_phase2_r{rank}_alpha{lora_alpha}_drop{lora_dropout}_targets{'_'.join(target_matrices)}_bs{fixed_batch_size}_epochs{fixed_epochs}_lr{fixed_learning_rate}"
            training_args = TrainingArguments(
                output_dir=output_dir,
                evaluation_strategy="epoch",
                learning_rate=fixed_learning_rate,
                per_device_train_batch_size=fixed_batch_size,
                per_device_eval_batch_size=fixed_batch_size,
                num_train_epochs=fixed_epochs,
                weight_decay=0.01,
                save_total_limit=1,
                save_strategy="epoch",
                logging_dir="./logs",
                logging_steps=10,
                load_best_model_at_end=True,
            )

            trainer = Trainer(
                model=model_with_lora,
                args=training_args,
                train_dataset=tokenized_train,
                eval_dataset=tokenized_test,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics
            )

            trainer.train()
            metrics = trainer.evaluate()

            end_time = time.time()
            elapsed_time = end_time - start_time
            print(f"Training time: {elapsed_time:.2f} seconds")

            results_phase_2.append({
                "Model": "ALBERT",
                "Batch Size": fixed_batch_size,
                "Epochs": fixed_epochs,
                "Learning Rate": fixed_learning_rate,
                "Rank": rank,
                "Alpha": lora_alpha,  # Fixed alpha
                "LoRA Dropout": lora_dropout,
                "Target Matrices": target_matrices,
                "Accuracy": metrics["eval_accuracy"],
                "Precision": metrics["eval_precision"],
                "Recall": metrics["eval_recall"],
                "F1-Score": metrics["eval_f1"]
            })


Running experiment with: Rank: 8, Target Matrices: ['attention.q_lin'], LoRA Dropout: 0.1
Model has 67,620,868 total parameters
Model has 665,858 trainable parameters
0.98% of the parameters are trainable
GPU memory allocated: 259.04 MB




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.494,0.42815,0.845,0.84994,0.845,0.844044
2,0.2297,0.27921,0.8875,0.890226,0.8875,0.887476
3,0.3116,0.275927,0.894,0.895165,0.894,0.894024
4,0.2714,0.274655,0.8925,0.892545,0.8925,0.892511
5,0.2618,0.27374,0.891,0.891101,0.891,0.891018


Training time: 454.02 seconds

Running experiment with: Rank: 8, Target Matrices: ['attention.q_lin'], LoRA Dropout: 0.2
Model has 67,620,868 total parameters
Model has 665,858 trainable parameters
0.98% of the parameters are trainable
GPU memory allocated: 282.91 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.499,0.435013,0.844,0.849444,0.844,0.84296
2,0.2313,0.280165,0.8875,0.889971,0.8875,0.887485
3,0.3083,0.276426,0.894,0.895081,0.894,0.894025
4,0.2729,0.275097,0.893,0.893021,0.893,0.893007
5,0.2635,0.274169,0.8915,0.891565,0.8915,0.891514


Training time: 467.03 seconds

Running experiment with: Rank: 8, Target Matrices: ['attention.q_lin', 'attention.k_lin'], LoRA Dropout: 0.1
Model has 67,694,596 total parameters
Model has 739,586 trainable parameters
1.09% of the parameters are trainable
GPU memory allocated: 283.82 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4441,0.357172,0.87,0.870645,0.87,0.869818
2,0.2173,0.2855,0.89,0.890774,0.89,0.890031
3,0.2738,0.278292,0.8945,0.89499,0.8945,0.89453
4,0.2482,0.27788,0.892,0.892012,0.892,0.891974
5,0.2498,0.275829,0.893,0.892992,0.893,0.892988


Training time: 479.16 seconds

Running experiment with: Rank: 8, Target Matrices: ['attention.q_lin', 'attention.k_lin'], LoRA Dropout: 0.2
Model has 67,694,596 total parameters
Model has 739,586 trainable parameters
1.09% of the parameters are trainable
GPU memory allocated: 284.66 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4469,0.359739,0.8695,0.870174,0.8695,0.869312
2,0.2162,0.285174,0.892,0.892846,0.892,0.892029
3,0.2772,0.278282,0.895,0.895463,0.895,0.89503
4,0.2533,0.277743,0.891,0.891012,0.891,0.890974
5,0.251,0.275947,0.891,0.891002,0.891,0.890979


Training time: 480.66 seconds

Running experiment with: Rank: 8, Target Matrices: ['attention.q_lin', 'attention.k_lin', 'attention.v_lin'], LoRA Dropout: 0.1
Model has 67,768,324 total parameters
Model has 813,314 trainable parameters
1.20% of the parameters are trainable
GPU memory allocated: 284.94 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.357,0.298979,0.883,0.883022,0.883,0.883007
2,0.2172,0.279719,0.885,0.887652,0.885,0.884978
3,0.2959,0.264674,0.895,0.895054,0.895,0.895013
4,0.2455,0.264079,0.8965,0.896504,0.8965,0.896502
5,0.235,0.262653,0.8975,0.897501,0.8975,0.897482


Training time: 489.69 seconds

Running experiment with: Rank: 8, Target Matrices: ['attention.q_lin', 'attention.k_lin', 'attention.v_lin'], LoRA Dropout: 0.2
Model has 67,768,324 total parameters
Model has 813,314 trainable parameters
1.20% of the parameters are trainable
GPU memory allocated: 285.78 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3603,0.298756,0.883,0.883009,0.883,0.883004
2,0.2161,0.280613,0.8855,0.888219,0.8855,0.885475
3,0.2985,0.264825,0.894,0.894054,0.894,0.894013
4,0.2473,0.263963,0.896,0.89602,0.896,0.896007
5,0.2379,0.262672,0.8965,0.896495,0.8965,0.896486


Training time: 489.57 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.q_lin'], LoRA Dropout: 0.1
Model has 67,842,052 total parameters
Model has 887,042 trainable parameters
1.31% of the parameters are trainable
GPU memory allocated: 285.50 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2948,0.273457,0.893,0.8931,0.893,0.893018
2,0.1891,0.274529,0.888,0.889729,0.888,0.88801
3,0.2682,0.264465,0.8975,0.897734,0.8975,0.897428
4,0.2144,0.262977,0.8975,0.897537,0.8975,0.897468
5,0.2122,0.26342,0.897,0.897178,0.897,0.896937


Training time: 490.31 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.q_lin'], LoRA Dropout: 0.2
Model has 67,842,052 total parameters
Model has 887,042 trainable parameters
1.31% of the parameters are trainable
GPU memory allocated: 286.35 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2733,0.274624,0.9,0.900031,0.9,0.899971
2,0.1714,0.271528,0.896,0.897001,0.896,0.896026
3,0.2408,0.271992,0.8965,0.897358,0.8965,0.896342
4,0.1878,0.264806,0.9045,0.904708,0.9045,0.904439
5,0.1943,0.265618,0.9055,0.905749,0.9055,0.905433


Training time: 490.72 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.q_lin', 'attention.k_lin'], LoRA Dropout: 0.1
Model has 67,915,780 total parameters
Model has 960,770 trainable parameters
1.41% of the parameters are trainable
GPU memory allocated: 286.91 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2689,0.267425,0.8915,0.892309,0.8915,0.89153
2,0.1823,0.279032,0.89,0.892176,0.89,0.889996
3,0.2696,0.260321,0.898,0.898009,0.898,0.898003
4,0.2042,0.263646,0.9005,0.900502,0.9005,0.900483
5,0.1961,0.267671,0.903,0.903406,0.903,0.902909


Training time: 491.24 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.q_lin', 'attention.k_lin'], LoRA Dropout: 0.2
Model has 67,915,780 total parameters
Model has 960,770 trainable parameters
1.41% of the parameters are trainable
GPU memory allocated: 287.75 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2486,0.264657,0.8995,0.899511,0.8995,0.899478
2,0.1849,0.280248,0.892,0.894065,0.892,0.892
3,0.2662,0.259999,0.901,0.90102,0.901,0.901006
4,0.2009,0.266241,0.904,0.904,0.904,0.903985
5,0.1928,0.27034,0.904,0.904462,0.904,0.903903


Training time: 490.96 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.q_lin', 'attention.k_lin', 'attention.v_lin'], LoRA Dropout: 0.1
Model has 67,989,508 total parameters
Model has 1,034,498 trainable parameters
1.52% of the parameters are trainable
GPU memory allocated: 288.32 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3548,0.298614,0.882,0.88199,0.882,0.881991
2,0.2171,0.284569,0.8815,0.88534,0.8815,0.881424
3,0.2919,0.262738,0.8945,0.894544,0.8945,0.894511
4,0.2447,0.261674,0.895,0.894997,0.895,0.894984
5,0.2336,0.260096,0.8955,0.895536,0.8955,0.895467


Training time: 491.55 seconds

Running experiment with: Rank: 16, Target Matrices: ['attention.q_lin', 'attention.k_lin', 'attention.v_lin'], LoRA Dropout: 0.2
Model has 67,989,508 total parameters
Model has 1,034,498 trainable parameters
1.52% of the parameters are trainable
GPU memory allocated: 289.16 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3595,0.298737,0.882,0.88199,0.882,0.881987
2,0.216,0.284842,0.881,0.885072,0.881,0.880913
3,0.2934,0.26275,0.895,0.895075,0.895,0.895015
4,0.2463,0.261745,0.895,0.895,0.895,0.895
5,0.2359,0.260355,0.8945,0.894535,0.8945,0.894467


Training time: 490.92 seconds
time: 1h 36min 46s (started: 2025-01-05 20:30:18 +00:00)


In [12]:
# Testing evaluations saved
results_df_phase_2 = pd.DataFrame(results_phase_2)
results_df_phase_2.to_csv("7_FT_ALBERT_Experiments_FixedTrainingHyp.csv", index=False)

time: 6.25 ms (started: 2025-01-05 22:15:32 +00:00)
