##### **Installing dependencies**

In [1]:
!pip install ipython-autotime gdown evaluate accelerate bitsandbytes peft loralib huggingface_hub transformers peft

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting loralib
  Downloading loralib-0.1.2-py3-none-any.whl.metadata (15 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.27.1-py3-none-any.whl.metadata (13 kB)
Downloading ipython_autotime-0.3.2-py2.py3-none-any.whl (7.0 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m

##### **Importing dependencies**

In [2]:
%load_ext autotime
import pandas as pd
import numpy as np
import nltk
import os
import zipfile
import tarfile
import re
import gdown
import gzip
import shutil
import wandb
import time
import torch
import psutil

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, precision_recall_fscore_support
from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    RobertaTokenizerFast, 
    RobertaForSequenceClassification,
    GPT2TokenizerFast, 
    GPT2ForSequenceClassification,
    GenerationConfig,
    TrainingArguments,
    Trainer,
    pipeline,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    DataCollatorWithPadding,
    AdamW,
    get_scheduler
)
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import time
import evaluate
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel,
    PeftConfig,
)
from huggingface_hub import login
import kagglehub

# from nltk.corpus import stopwords
# from nltk import word_tokenize
# from nltk.stem import WordNetLemmatizer
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from google.colab import files
# from scipy.sparse import hstack
# from gensim.models import Word2Vec

import warnings

# Suppress specific warnings
warnings.filterwarnings("ignore", message=".*clean_up_tokenization_spaces.*")
warnings.filterwarnings("ignore", message="Some weights of DistilBertForSequenceClassification were not initialized.*")
warnings.filterwarnings("ignore", message=".*evaluation_strategy.*")
warnings.filterwarnings("ignore", message=".*gather along dimension 0.*")

time: 16.8 s (started: 2025-01-06 12:36:59 +00:00)


In [3]:
# Disable wandb Logging
os.environ["WANDB_MODE"] = "disabled"
wandb.init()

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda
time: 5.65 s (started: 2025-01-06 12:37:24 +00:00)


##### **Supporting functions**

In [4]:
def clean_review(review):
    review = re.sub(r'<.*?>', '', review)
    review = re.sub(r'http\S+|www\S+|https\S+', '', review, flags=re.MULTILINE)
    review = review.strip()
    return review

def preprocess_function(examples):
    inputs = tokenizer(examples["review"], truncation=True, padding=True, max_length=512)
    inputs["labels"] = [1 if label.lower() == "positive" else 0 for label in examples["sentiment"]]
    return inputs

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

time: 686 µs (started: 2025-01-06 12:37:29 +00:00)


##### **Loading data**

In [5]:
train_df_full = pd.read_csv("/kaggle/input/imdb-dataset-final-1/train.csv")
# train_df = train_df_full.sample(n=3000, random_state=42)
train_df = train_df_full.copy()
train_df['review'] = train_df['review'].apply(clean_review)
train_df.reset_index(drop=True, inplace=True)

time: 1.32 s (started: 2025-01-06 12:37:33 +00:00)


In [6]:
test_df_full = pd.read_csv("/kaggle/input/imdb-dataset-final-1/test.csv")
# test_df = test_df_full.sample(n=2000, random_state=42)
test_df = test_df_full.copy()
test_df['review'] = test_df['review'].apply(clean_review)
test_df.reset_index(drop=True, inplace=True)

time: 786 ms (started: 2025-01-06 12:37:35 +00:00)


In [7]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

time: 561 ms (started: 2025-01-06 12:37:35 +00:00)


### **DistilBERT Final**

In [8]:
model_checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to(device)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

print(f"Model is running on device: {model.device}")

# Finalized hyperparameters and LoRA configurations
final_batch_size = 16
final_learning_rate = 1e-4
final_epochs = 5
final_dropout = 0.1
final_rank = 16
final_target_matrices = ["attention.q_lin", "attention.k_lin"]
final_lora_dropout = 0.2
final_lora_alpha = 16

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Model is running on device: cuda:0
time: 49 s (started: 2025-01-06 12:37:40 +00:00)


In [9]:
results = []

final_lora_config = LoraConfig(
    r=final_rank,
    lora_alpha=final_lora_alpha,
    target_modules=final_target_matrices,
    lora_dropout=final_lora_dropout,
    task_type="SEQ_CLS"
)

model_with_lora = get_peft_model(model, final_lora_config)

num_parameters = sum(p.numel() for p in model_with_lora.parameters())
trainable_parameters = sum(p.numel() for p in model_with_lora.parameters() if p.requires_grad)
trainable_percentage = (trainable_parameters / num_parameters) * 100

print(f"Model has {num_parameters:,} total parameters")
print(f"Model has {trainable_parameters:,} trainable parameters")
print(f"{trainable_percentage:.2f}% of the parameters are trainable")

if torch.cuda.is_available():
    torch.cuda.empty_cache()
    gpu_memory = torch.cuda.memory_allocated() / 1024**2  # in MB
    print(f"GPU memory allocated: {gpu_memory:.2f} MB")

wandb.config.update({"model/num_parameters": model.num_parameters()}, allow_val_change=True)

output_dir = "./final_model_output"
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=final_learning_rate,
    per_device_train_batch_size=final_batch_size,
    per_device_eval_batch_size=final_batch_size,
    num_train_epochs=final_epochs,
    weight_decay=0.01,
    save_total_limit=1,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    push_to_hub=False  # We'll handle the push manually
)

trainer = Trainer(
    model=model_with_lora,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

start_time = time.time()
trainer.train()
metrics = trainer.evaluate()
end_time = time.time()

elapsed_time = end_time - start_time
print(f"Training time: {elapsed_time:.2f} seconds")
print(metrics)

model_with_lora.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

results.append({
    "Model": "DistilBERT",
    "Batch Size": final_batch_size,
    "Epochs": final_epochs,
    "Learning Rate": final_learning_rate,
    "Rank": final_rank,
    "Alpha": final_lora_alpha,
    "LoRA Dropout": final_lora_dropout,
    "Target Matrices": final_target_matrices,
    "Accuracy": metrics["eval_accuracy"],
    "Precision": metrics["eval_precision"],
    "Recall": metrics["eval_recall"],
    "F1-Score": metrics["eval_f1"]
})

Model has 67,842,052 total parameters
Model has 887,042 trainable parameters
1.31% of the parameters are trainable
GPU memory allocated: 259.88 MB




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2085,0.224321,0.9091,0.90911,0.9091,0.909098
2,0.2276,0.211446,0.9151,0.915106,0.9151,0.915101
3,0.2201,0.209494,0.9163,0.916504,0.9163,0.916283
4,0.1961,0.206469,0.91725,0.917442,0.91725,0.917234
5,0.1785,0.206528,0.9182,0.918266,0.9182,0.918193


Training time: 4676.36 seconds
{'eval_loss': 0.20646850764751434, 'eval_accuracy': 0.91725, 'eval_precision': 0.9174420591316873, 'eval_recall': 0.91725, 'eval_f1': 0.9172339475551818, 'eval_runtime': 219.3898, 'eval_samples_per_second': 91.162, 'eval_steps_per_second': 2.849, 'epoch': 5.0}
time: 1h 17min 57s (started: 2025-01-06 12:38:42 +00:00)


In [10]:
results = pd.DataFrame(results)
results.to_csv("5_FT_DistilBERT_Final.csv", index=False)

time: 5.86 ms (started: 2025-01-06 13:56:48 +00:00)


In [11]:
from huggingface_hub import HfApi

api = HfApi()

repo_id = "annayah925/distilbert-imdb-finetuned"
folder_path = output_dir

api.upload_folder(
    repo_id=repo_id,
    folder_path=folder_path,
    commit_message="Final fine-tuned DistilBERT model for IMDb Sentiment Analysis",
    token="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"
)

print(f"Model successfully pushed to Hugging Face Hub at: https://huggingface.co/{repo_id}")

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.55M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/7.12M [00:00<?, ?B/s]

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/3.55M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

Model successfully pushed to Hugging Face Hub at: https://huggingface.co/annayah925/distilbert-imdb-finetuned
time: 8.59 s (started: 2025-01-06 13:59:46 +00:00)
