<a href="https://colab.research.google.com/github/IdanKanat/COVID_NLP_Advanced_DL_Project/blob/main/AdvancedTopicsDL_Project_IdanKanat%26IdoShahar_COVID_NLP_21.8.2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Imports**

In [None]:
!pip install transformers
!pip install optuna
!pip install wandb
!pip install evaluate

In [None]:
!pip install huggingface_hub

In [None]:
# Relevant imports:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from google.colab import drive
drive.mount('/content/drive')
import shutil
from google.colab import files

import os
import json
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from torch.optim import AdamW
from torch import nn, optim
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback, get_scheduler
from torch.nn.utils import prune

import optuna
import wandb
from datasets import Dataset, DatasetDict, load_from_disk, Value, Sequence, concatenate_datasets

from sklearn.model_selection import train_test_split
from pathlib import Path
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device  # echo the selected device as the cell output

# **Part A - Exploratory Data Analysis (EDA)**

#### Data Path (Relevant for running the files not from Drive) - **PLEASE FIRST DOWNLOAD THE [Project_COVID_NLP folder](https://drive.google.com/drive/folders/1egGGJ6F878xIk_bKUfjhyZStESiliwRC?usp=sharing) accessible from idankanat@gmail.com's Google drive!!**

In [None]:
# Base directory that contains the downloaded Project_COVID_NLP folder.
# The default assumes Google Colab with Drive mounted; change ONLY this value when the
# folder was downloaded somewhere else.
basic_drive_path = "/content/drive/MyDrive"
# Root project folder
project_root = basic_drive_path + "/Project_COVID_NLP"
# Folder holding the raw and cleaned CSV files
data_path = "/".join([project_root, "data"])

In [None]:
# Loading the Corona_NLP_train dataset:
# The raw CSV is read from data_path and decoded as latin1 into the EDA DataFrame `df`.
df = pd.read_csv(f"{data_path}/Corona_NLP_train.csv", encoding='latin1')
df.head(10)  # preview the first 10 rows as the cell output

In [None]:
# Log in to Weights & Biases; the training loop logs its per-epoch metrics through wandb.
wandb.login()

### **Sentiments Distribution**

In [None]:
# Tally tweets per sentiment and arrange the classes from most negative to most positive
custom_order = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']
sentiment_counts = df['Sentiment'].value_counts().reindex(custom_order)

# Break the two longest labels onto two lines so the tick labels do not collide
for long_label, wrapped_label in (
    ("Extremely Positive", "Extremely\nPositive"),
    ("Extremely Negative", "Extremely\nNegative"),
):
    sentiment_counts.index = sentiment_counts.index.str.replace(long_label, wrapped_label)

# Bar chart of the overall sentiment distribution
ax = sentiment_counts.plot(kind='bar', color='blue', edgecolor='black')

# Title, axis labels and horizontal tick labels
plt.title("Sentiment Distribution", fontweight='bold')
plt.xlabel("Sentiment")
plt.ylabel("Number of Tweets")
plt.xticks(rotation=0)

# Head-room above the tallest bar so the value labels stay inside the axes
plt.ylim(0, sentiment_counts.max() + 1500)

# Bold, comma-formatted count above every bar (classes absent after the reindex are skipped)
for position, tweet_total in enumerate(sentiment_counts):
    if pd.isna(tweet_total):
        continue
    plt.text(position, tweet_total + 200, f"{int(tweet_total):,}",
             ha='center', va='bottom', fontsize=10, fontweight="bold")

# Render the figure
plt.tight_layout()
plt.show()

From the sentiment distribution shown above, we can draw a few conclusions:
1. **There are more positive tweets than negative tweets.**
2. **There are more extremely positive tweets than extremely negative tweets.** This ensures that even when combining the extremes of each sentiment, positive tweets outnumber negatives. The gap between positive and negative tweets enlarges as we add the extremes of each group.

### **Daily Tweet Counts**

In [None]:
# Standardize the 'TweetAt' date column; values that cannot be parsed become NaT and are dropped.
# NOTE(review): dayfirst=False treats ambiguous dates as month-first — confirm this matches the
# date format of the raw CSV.
df['TweetAt'] = pd.to_datetime(df['TweetAt'], dayfirst=False, errors='coerce')
# .copy() makes the filtered frame independent, so the column assignments below never target a view
df = df.dropna(subset=['TweetAt']).copy()

# Create a new column 'YearDay' (calendar date of the tweet) for grouping by day
df['YearDay'] = df['TweetAt'].dt.date

# Add a column for tweet length (number of characters):
df['TweetLength'] = df['OriginalTweet'].astype(str).str.len()

# Define sentiment colors:
sentiment_order = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']
colors = {
    'Extremely Negative': '#e74c3c',
    'Negative': '#e67e22',
    'Neutral': '#f1c40f',
    'Positive': '#2ecc71',
    'Extremely Positive': '#3498db',
    'All Tweets': 'gray'
}

# First, plotting all tweets:
fig, ax = plt.subplots(figsize=(10, 5))
day_counts_all = df.groupby('YearDay').size()
x_all = day_counts_all.index.astype(str)  # dates as strings -> one categorical tick per day
y_all = day_counts_all.values
ax.plot(x_all, y_all, color=colors['All Tweets'], marker='o', linewidth=2)
ax.set_title("Daily Tweet Counts", fontweight='bold', fontsize=14, pad=20)
ax.set_xlabel("Day")
ax.set_ylabel("Tweet Count")
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

# Second, plotting stratified sentiment trends in one plot:
fig, ax = plt.subplots(figsize=(12, 6))

for sentiment in sentiment_order:
    # Align every sentiment to the full list of days (0 where it has no tweets). The x axis is
    # categorical (strings), so a series that skipped a day would otherwise append that day
    # out of chronological order when a later series introduces it.
    day_counts = (
        df.loc[df['Sentiment'] == sentiment]
        .groupby('YearDay')
        .size()
        .reindex(day_counts_all.index, fill_value=0)
    )
    ax.plot(x_all, day_counts.values, label=sentiment, color=colors[sentiment], marker='o', linewidth=2)

ax.set_title("Daily Tweet Counts by Sentiment", fontweight='bold', fontsize=14, pad=20)
ax.set_xlabel("Day")
ax.set_ylabel("Tweet Count")
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.5)
ax.legend(title="Sentiment")
plt.tight_layout()
plt.show()

From the two plots above, we can conclude:
- As we could intuitively predict, there was a surge of tweets in March 2020 following the COVID-19 outbreak.
- This massive surge wasn't driven by any single sentiment; rather, tweets of every sentiment became much more frequent in March.

### **Tweet Length Distribution**

In [None]:
# Character count of every tweet (non-string values are stringified first)
tweet_lengths = df['OriginalTweet'].astype(str).str.len()
df['TweetLength'] = tweet_lengths

# Histogram of the tweet lengths
plt.figure(figsize=(8, 5))
plt.hist(df['TweetLength'], bins=50, color='purple', edgecolor='black')

# Title, axis labels and a light horizontal grid
plt.title("Distribution of Tweet Lengths (in characters)", fontweight='bold')
plt.xlabel("Number of Characters")
plt.ylabel("Number of Tweets")
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Render the figure
plt.tight_layout()
plt.show()

In [None]:
# Displaying summary statistics using the .describe() command:
# every statistic is cast to int, so the mean and std are shown truncated.
length_stats = df['TweetLength'].describe().astype(int)
length_stats

From the tweet length distributions, several conclusions can be drawn:
- **Strong left (negative) skew toward the character limit -** Tweet counts visibly increase with length, peaking around 240–280 characters, with a long tail of shorter tweets.

- **A sharp drop after ~280 characters -** Reflects the Twitter character limit (likely 280) — tweets can't go longer, so the distribution is naturally cut off there.

In [None]:
from matplotlib.patches import Rectangle

# Sentiment classes and the color used for each histogram
sentiment_order = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']
colors = {
    'Extremely Negative': '#e74c3c',
    'Negative': '#e67e22',
    'Neutral': '#f1c40f',
    'Positive': '#2ecc71',
    'Extremely Positive': '#3498db',
    'All Tweets': 'gray'
}

# Panel order, row by row; 'All Tweets' lands in the middle of the top row
plot_order = [
    'Extremely Negative', 'All Tweets', 'Negative',
    'Extremely Positive', 'Neutral', 'Positive'
]

# 2x3 grid of axes, flattened so it can be walked together with plot_order
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(16, 10))
axes = axes.flatten()

# One histogram per panel
for panel, label in zip(axes, plot_order):
    is_overall = label == 'All Tweets'
    lengths = df['TweetLength'] if is_overall else df[df['Sentiment'] == label]['TweetLength']

    panel.hist(lengths, bins=40, color=colors[label], edgecolor='black', alpha=0.9)
    panel.set_title(label, fontweight='bold')
    panel.set_xlabel("Tweet Length (characters)")
    panel.set_ylabel("Count")
    panel.grid(axis='y', linestyle='--', alpha=0.5)

    # The overall distribution gets a thick red frame so it stands out as the reference panel
    if is_overall:
        for spine in panel.spines.values():
            spine.set_edgecolor('red')
            spine.set_linewidth(3)

# Shared title; the rect leaves room for it above the panels
plt.suptitle("Tweet Length Distributions by Sentiment", fontsize=16, fontweight='bold')
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

Comparing the stratified distribution charts above to the general tweet-length distribution, a few insights emerge:
1. The distribution of EACH of the non-neutral sentiments (i.e. positive, negative, and both extreme sentiments) seems to ***largely*** align with the general tweet-length distribution - left-skewed, i.e. with a tail to the left. Long tweets are frequent.
2. The only distinctly different stratified histogram is that of the ***neutral*** sentiment, where shorter tweets are also common alongside the longer tweets (which are frequent in the other histograms too).

In [None]:
# (Re)compute the character length of every tweet so this cell does not depend on earlier ones
df['TweetLength'] = df['OriginalTweet'].astype(str).str.len()

# Column labels of the summary: the five sentiments followed by the overall column
all_labels = sentiment_order + ['All Tweets']

# describe() of the tweet lengths for each sentiment, truncated to integers
summary_dict = {
    sentiment: df[df['Sentiment'] == sentiment]['TweetLength'].describe().astype(int)
    for sentiment in sentiment_order
}

# Same statistics over all tweets
summary_dict['All Tweets'] = df['TweetLength'].describe().astype(int)

# One column per group, one row per statistic, rows in a fixed order
metric_rows = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
summary_df = pd.DataFrame(summary_dict)
summary_df = summary_df.reindex(metric_rows)

# Row-wise highlighter used by the pandas Styler
def highlight_extremes(row):
    """Return one CSS string per cell of `row`, marking its largest and smallest values.

    Cells equal to the row maximum get a light-green background, cells equal to the row
    minimum get a salmon background, and all other cells get an empty style. A cell that is
    both the maximum and the minimum is treated as the maximum.
    """
    top, bottom = row.max(), row.min()
    styles = []
    for value in row:
        if value == top:
            styles.append('background-color: lightgreen')
        elif value == bottom:
            styles.append('background-color: salmon')
        else:
            styles.append('')
    return styles

# Styling the dataframe per row (axis=1): each statistic's maximum across the groups is
# shaded green and its minimum red (salmon), as decided by highlight_extremes
styled_df = summary_df.style.apply(highlight_extremes, axis=1)

styled_df

This table displays the key statistics of each of the stratified distributions (one per sentiment), as well as of the general tweet-length distribution. We can observe:

- The longest tweet belongs to the extremely negative sentiment group (355 characters!), while the longest extremely positive tweet has 338 characters, suggesting that tweets expressing extreme emotions tend to be longer, i.e. emotional intensity is potentially POSITIVELY associated with length.
- The neutral sentiment distribution has the largest S.D, aligning with bigger spread than the other distributions, as described above in the graphs.

### **Tweets by Region**

In [None]:
# Normalize the location column: missing values become empty strings, everything is lowercased
df['Location'] = df['Location'].fillna("").str.lower()  # Standardize by lowercasing all location values

import re  # imported here because the notebook otherwise first imports `re` in a later cell

# Group key-words by region:
region_keywords = {
    "US": [
        "usa", "u.s.a", "u.s", "america", "united states of america", "united states", "texas", "tx", "austin",
        "houston", "abilene", "new york", "new york city", "nyc", "ny", "california", "ca", "florida", "fl",
        "washington", "dc", "washington dc", "washington d.c.", "alaska", "chicago", "illinois", "arizona", "az",
        "atlanta", "ga", "baltimore", "boston", "brooklyn", "manhattan", "queens", "bronx", "staten island",
        "il", "nc", "nj", "va", "tn", "oh", "ohio", "sc", "co", "colorado", "detroit", "mi", "hollywood",
        "los angeles", "san fransisco", "honolulu", "hi", "indiana", "in", "kansas", "philadelphia", "pa",
        "phoenix", "me", "or", "portland", "oregon", "las vegas", "nv", "maryland", "nevada", "massachusetts",
        "miami", "michigan", "minneapolis", "nashville", "new orleans", "new jersey", "salt lake city", "ut",
        "utah", "slc", "san diego", "seattle", "silicon valley"
    ],
    "UK & Commonwealth": [
        "england", "uk", "u.k", "united kingdom", "london", "essex", "leeds", "liverpool", "manchester",
        "canada", "toronto", "ontario", "alberta", "british columbia", "montreal", "quebec", "ottawa", "vancouver",
        "australia", "south australia", "canberra", "melbourne", "sydney", "adelaide", "victoria",
        "new zealand", "auckland", "scotland", "aberdeen", "edinburgh", "glasgow", "ireland", "dublin"
    ],
    "Europe": [
        "netherlands", "amsterdam", "nederland", "holland", "the netherlands",
        "germany", "berlin", "frankfurt", "munich", "hamburg", "dusseldorf", "deutschland",
        "france", "paris", "belgium", "brussels", "switzerland", "geneva", "zurich",
        "spain", "barcelona", "madrid", "italy", "milan", "milano", "rome", "roma",
        "portugal", "lisbon", "austria", "vienna", "russia", "moscow", "st. petersburg"
    ],
    "Africa": [
        "south africa", "cape town", "johannesburg", "ghana", "accra", "nigeria", "lagos",
        "kenya", "uganda", "kampala"
    ],
    "Asia": [
        "india", "mumbai", "new delhi", "delhi", "bangalore", "hong kong", "singapore",
        "japan", "tokyo", "pakistan", "malaysia", "china", "shanghai",
        "united arab emirates", "united arab emirate", "abu dhabi", "uae", "dubai"
    ]
}

# Reverse mapping: from keyword to region
keyword_to_region = {
    keyword: region for region, keywords in region_keywords.items() for keyword in keywords
}

# One pre-compiled whole-word pattern per keyword, longest keywords first so that a specific
# name ("germany") wins over a short abbreviation ("in") appearing in the same location.
# Lookarounds are used instead of \b because some keywords end with "." (e.g. "washington d.c.").
# sorted() is stable, so keywords of equal length keep their original (US-first) priority.
_region_patterns = [
    (re.compile(r"(?<!\w)" + re.escape(keyword) + r"(?!\w)"), region)
    for keyword, region in sorted(keyword_to_region.items(), key=lambda item: len(item[0]), reverse=True)
]

# Assigning region per location using this function:
def assign_region(location):
    """Map a free-text location to one of the regions in `region_keywords`.

    A keyword only counts when it appears as a whole word/phrase. Plain substring matching
    misfiles many locations: "india" contains "in", "canada" contains "ca" and "china"
    contains "hi", all of which are US state abbreviations in the keyword list.

    Args:
        location: Location text; it is lowercased before matching.

    Returns:
        The region name of the longest matching keyword, or None when `location` is empty,
        not a string, or contains no keyword.
    """
    if not isinstance(location, str) or not location:
        return None
    normalized = location.lower()
    for pattern, region in _region_patterns:
        if pattern.search(normalized):
            return region
    return None

# Tag every tweet with the region derived from its location text (None when nothing matches)
df['Region'] = df['Location'].apply(assign_region)

# Tweets per region as a two-column frame, largest region first
region_counts = df['Region'].value_counts().reset_index().set_axis(['Region', 'TweetCount'], axis=1)

# Wedge labels of the form "<region> (<tweet count>)"
wedge_labels = [f"{row.Region} ({row.TweetCount})" for row in region_counts.itertuples(index=False)]

# Pie chart of the regional shares
plt.figure(figsize=(9, 9))
wedges, texts, autotexts = plt.pie(
    region_counts['TweetCount'],
    labels=wedge_labels,
    autopct="%1.1f%%",
    startangle=140,
    pctdistance=0.65,       # percentages drawn inside the wedges
    labeldistance=1.15,     # labels pushed outside the circle
    textprops={'fontsize': 12}
)

plt.title("Tweet Distribution by Region", fontsize=14, pad=35)
plt.axis('equal')
plt.tight_layout()
plt.show()

The dataset contains 41,158 tweets, but there are 8,594 tweets without location values (~20%).


So, out of the remaining 32,564 tweets with location values, we analyzed the location distribution of **86% of them (28,095 tweets)** as shown in the above pie chart.

The other 14% were non-indicative locations (gibberish, small & irrelevant cities without countries mentioned, etc.)

### **Data Cleaning**

To reduce noise in the tweet content, we prepared the Corona_NLP dataset (train & test) for sentiment analysis by standardizing the tweet text. This included:

- Expanded English contractions (e.g., don’t → do not) to standardize wording.

- Replaced URLs and user mentions with placeholders, while simplifying hashtags.

- Removed unnecessary punctuation and normalized whitespace.

- Lowercased text to ensure consistency across tokens.

The clean versions of the train & test datasets with an added CleanTweet column, were saved as new CSV files for further modeling.

In [None]:
# Read the raw train and test CSVs (decoded as latin1) that the cleaning below is applied to
train_df = pd.read_csv(f"{data_path}/Corona_NLP_train.csv", encoding='latin1')
test_df = pd.read_csv(f"{data_path}/Corona_NLP_test.csv", encoding='latin1')

import re
from datetime import datetime

# Minimal set to avoid external libs. Non-destructive if not present.
# We will apply this function inside the next function (clean_tweet_sentiment_friendly)
def basic_contractions_expand(text: str) -> str:
    """Expand a fixed list of common English contractions (e.g. "can't" -> "can not").

    Matching is case-insensitive and limited to whole words; the replacement is always the
    lowercase expansion from the table below. Text without any listed contraction is
    returned unchanged.
    """
    expansions = {
        "can't": "can not", "won't": "will not", "don't": "do not", "doesn't": "does not",
        "didn't": "did not", "i'm": "i am", "it's": "it is", "that's": "that is",
        "there's": "there is", "they're": "they are", "we're": "we are", "you're": "you are",
        "isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not",
        "shouldn't": "should not", "couldn't": "could not", "wouldn't": "would not",
        "i've": "i have", "we've": "we have", "they've": "they have", "who's": "who is",
        "what's": "what is", "let's": "let us", "i'll": "i will", "you'll": "you will",
        "he's": "he is", "she's": "she is"
    }
    # Single alternation of all (escaped) contractions, anchored on word boundaries
    alternatives = "|".join(re.escape(contraction) for contraction in expansions)
    contraction_re = re.compile(r"\b(" + alternatives + r")\b", flags=re.IGNORECASE)
    # Look the match up by its lowercase form; fall back to the matched text itself
    return contraction_re.sub(
        lambda match: expansions.get(match.group(0).lower(), match.group(0)),
        text,
    )

#the main cleaning function of the dataset
def clean_tweet_sentiment_friendly(text: str) -> str:
    """Normalize one tweet while keeping its sentiment cues.

    Steps, in order: expand contractions, replace URLs with a URL token, drop retweet ("RT")
    markers, replace user mentions with "@user", strip the "#" from hashtags, blank out a
    fixed set of punctuation/special characters, collapse whitespace and lowercase.

    Args:
        text: Raw tweet text. Missing values (NaN/None) are returned unchanged.

    Returns:
        The cleaned, lowercased tweet.
    """
    if pd.isna(text):
        return text
    t = basic_contractions_expand(text)
    t = re.sub(r'http\S+|www\.\S+', ' URL ', t)             # URLs -> token (lowercased to "url" at the end)
    # RT markers: standalone "RT" followed by whitespace, either after whitespace or at the very
    # start of the tweet (the usual position, which a lookbehind for whitespace alone misses)
    t = re.sub(r'(?:^|(?<=\s))RT\s+', ' ', t)
    t = re.sub(r'@\w+', ' @user ', t)                       # Mentions -> @user
    t = re.sub(r'#(\w+)', r'\1', t)                         # Remove hashtags but keep hashtag word
    # Strip the listed punctuation/special characters; ! ? . , and apostrophes are not in the
    # list and therefore stay in the text
    t = re.sub(r"[\"$%^&*()\-_=+\[\]{};:|/\\<>]", " ", t)
    t = re.sub(r'\s+', ' ', t).strip()                      # Normalize whitespaces to only one whitespace
    t = t.lower()                                           # Lowercase
    return t


# Add the CleanTweet column to copies of the raw frames, leaving train_df / test_df untouched
clean_train = train_df.copy()
clean_train["CleanTweet"] = clean_train["OriginalTweet"].apply(clean_tweet_sentiment_friendly)
clean_test = test_df.copy()
clean_test["CleanTweet"] = clean_test["OriginalTweet"].apply(clean_tweet_sentiment_friendly)

# Persist the cleaned datasets as UTF-8 CSV files next to the raw ones
clean_train.to_csv(f"{data_path}/CLEAN_Corona_NLP_train.csv", index=False, encoding="utf-8")
clean_test.to_csv(f"{data_path}/CLEAN_Corona_NLP_test.csv",  index=False, encoding="utf-8")

# Basic inspections of the cleaned train & test datasets.
# Only the last expression of a cell is rendered automatically, so both previews go through
# display() (provided by the notebook runtime); otherwise the train preview is never shown.
display(clean_train.head(200))
display(clean_test.head(200))

# **Part B - Training Pre-Trained HuggingFace models**

### **Data Splitting - Train and Validation**

We split the original training dataset into training and validation subsets, ensuring stratification which respects the original label / sentiment distributions, now in the new subsets. Besides, the new subsets contained only relevant info for classification, i.e. the cleaned tweet content and the labels / sentiments themselves.

In [None]:
# Load the CLEAN datasets. They are written by the cleaning step with encoding="utf-8", so they
# must be decoded as UTF-8 here; decoding them as latin1 would turn every non-ASCII character
# into mojibake before tokenization.
# We are loading the only two relevant columns for the classification (Sentiment label & Cleaned Tweet content)
train_df = pd.read_csv(f"{data_path}/CLEAN_Corona_NLP_train.csv", encoding='utf-8', usecols=["Sentiment", "CleanTweet"])
test_df = pd.read_csv(f"{data_path}/CLEAN_Corona_NLP_test.csv", encoding='utf-8', usecols=["Sentiment", "CleanTweet"])

# Fixed label mapping for all of the data before splitting
label_order = ['Extremely Negative','Negative','Neutral','Positive','Extremely Positive']
label2id = {l:i for i,l in enumerate(label_order)}
id2label = {i:l for l,i in label2id.items()}

test_size = len(test_df)

# Split the training data to create a validation set of the same size as the test set, stratification was included to keep the same label distribution across the training & validation subsets
train_df_reduced, val_df = train_test_split(
    train_df,
    test_size=test_size, # Validation set's size equals the test set's size (absolute row count)
    random_state=42,
    stratify=train_df['Sentiment'] # Stratified the subsets w.r.t labels - sentiments
)

train_df_reduced_size = len(train_df_reduced)
val_size = len(val_df)
total = train_df_reduced_size + val_size + test_size

# Create a summary table: number of records per subset and its share of all records
summary = {
    "Dataset": ["Training after splitting", "Validation", "Test"],
    "Records": [train_df_reduced_size, val_size, test_size],
    "Percentage": [round(100 * train_df_reduced_size / total, 2),
                   round(100 * val_size / total, 2),
                   round(100 * test_size / total, 2)]
}


summary_df = pd.DataFrame(summary)
summary_df.head()

## **Importing 2 Models from HuggingFace (HF) - *RoBERTa-Base-Tweet* and *BERTweet-Base***

### **A Look at the Model (1)** - ***RoBERTa-Base-Tweet***

In [None]:
# Hugging Face checkpoint used for the first model, together with its tokenizer
model_name = "cardiffnlp/twitter-roberta-base"
tokenizer_twitter_roberta_base = AutoTokenizer.from_pretrained(model_name)

# Load the checkpoint with a sequence-classification head of 5 outputs (one per sentiment)
# and move it to the selected device
roberta_tweets_1_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
roberta_tweets_1_model = roberta_tweets_1_model.to(device)
roberta_tweets_1_model  # glancing at the model architecture

## **Helper Functions**

### **Tweet Dataset Class**

In [None]:
# Defining the TweetDataset class with 3 built in functions (init, len and getitem) for integration with the PyTorch DataLoader object
# The base class is spelled out as torch.utils.data.Dataset on purpose: in this notebook the bare
# name `Dataset` is rebound by the later `from datasets import Dataset` import, so it no longer
# refers to the PyTorch class.
class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one raw tweet and its integer label per index."""

    def __init__(self, dataframe, tokenizer, max_length=128, label_map=None):
        """
        Args:
            dataframe (pd.DataFrame): DataFrame with 'CleanTweet' (text) and 'Sentiment' (label name) columns
            tokenizer: HuggingFace tokenizer for text processing (stored, not applied here)
            max_length (int): Maximum sequence length (stored, not applied here)
            label_map (dict | None): Label name -> integer id; defaults to the notebook-level `label2id`

        Raises:
            ValueError: If a 'Sentiment' value is missing from the label mapping.
        """
        if label_map is None:
            label_map = label2id

        self.texts = dataframe['CleanTweet'].tolist()

        # Unknown label names map to NaN, which would silently turn every label into a float
        label_ids = dataframe['Sentiment'].map(label_map)
        unmapped = label_ids.isna()
        if unmapped.any():
            unknown = sorted(set(dataframe.loc[unmapped, 'Sentiment'].astype(str)))
            raise ValueError(f"Sentiment values missing from the label mapping: {unknown}")
        self.labels = label_ids.astype(int).tolist()

        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of tweets in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the untokenized example at `idx` as {"text": str, "label": int}."""
        return {"text": self.texts[idx], "label": self.labels[idx]}

### **Early Stopping Check**

In [None]:
# Check for early stopping, applied for regularization. If the relevant validation metric (accuracy) shows no
# improvement (w.r.t best observed val metric up until now) for `patience` consecutive epochs, model training stops.
# This function outputs the best_val_accuracy, epoch & early stop flag for each epoch it's called
def early_stop_check(patience, best_val_accuracy, best_val_accuracy_epoch, current_val_accuracy, current_val_accuracy_epoch):
    """Update the best validation accuracy and decide whether training should stop.

    Args:
        patience (int): Number of consecutive epochs without improvement that triggers the stop.
        best_val_accuracy (float): Best validation accuracy seen before this epoch.
        best_val_accuracy_epoch (int): Epoch at which that best accuracy was reached.
        current_val_accuracy (float): Validation accuracy of the current epoch.
        current_val_accuracy_epoch (int): Index of the current epoch.

    Returns:
        tuple: (best_val_accuracy, best_val_accuracy_epoch, early_stop_flag). A strictly higher
        accuracy becomes the new best and never stops training; ties count as no improvement.
    """
    if current_val_accuracy > best_val_accuracy:
        return current_val_accuracy, current_val_accuracy_epoch, False

    # `>=` so that training stops on the patience-th stale epoch; a strict `>` would only stop
    # one epoch later, i.e. after patience + 1 epochs without improvement.
    epochs_without_improvement = current_val_accuracy_epoch - best_val_accuracy_epoch
    early_stop_flag = epochs_without_improvement >= patience
    return best_val_accuracy, best_val_accuracy_epoch, early_stop_flag

## **Model Training**
    The train_model_with_hyperparams function trains the model using the given training and validation loaders,
    with early stopping.
    Logs training and validation performance to Weights & Biases (loss, accuracy, precision, recall and F1-score).
    Restores the weights of the best epoch (by validation accuracy) into the model and returns the best validation accuracy.

      Args:
        model (.from_pretrained): Transformer encoder model, imported from HuggingFace
        train_loader (DataLoader): DataLoader for training data
        val_loader (DataLoader): DataLoader for validation data
        optimizer (torch.optim.Optimizer): Optimizer
        criterion (nn.Module): Loss function
        epochs (int): Max number of epochs
        patience (int): Early stopping patience
        trial (optuna.trial.Trial): Current Optuna trial
    Returns:
        float: Best validation accuracy

In [None]:
def train_model_with_hyperparams(model, train_loader, val_loader, optimizer, criterion, epochs, patience, trial):
    """
    Train `model` with mixed precision and keep the weights of its best validation epoch.

    Every epoch runs one pass over `train_loader` and, when `val_loader` is given, one
    evaluation pass. Loss, accuracy and macro precision/recall/F1 of both splits are logged
    to Weights & Biases when a run is active. After the last epoch the weights of the epoch
    with the highest validation accuracy are loaded back into `model`.

    Args:
        model: Sequence-classification model whose forward output exposes `.logits`.
        train_loader: Iterable of batches holding "input_ids", "attention_mask" and "labels" tensors.
        val_loader: Same batch format as `train_loader`, or None to skip validation (and logging).
        optimizer (torch.optim.Optimizer): Optimizer stepping the model parameters.
        criterion (nn.Module): Loss, called as criterion(logits, labels).
        epochs (int): Maximum number of epochs.
        patience (int | None): Early-stopping patience handed to `early_stop_check`;
            None disables early stopping but the best epoch is still tracked.
        trial (optuna.trial.Trial): Current Optuna trial; kept for interface compatibility,
            not used by the body.

    Returns:
        float: Best validation accuracy observed (0.0 when no validation pass ran).
    """
    # speed toggles (safe to call each time)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    device_ = next(model.parameters()).device  # robust device grab
    # Mixed precision only makes sense on CUDA; on CPU autocast and the scaler are explicit no-ops
    use_amp = device_.type == "cuda"
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    best_val_accuracy = 0.0 # Initialize best validation accuracy
    best_val_accuracy_epoch = 0 # Track epoch with the best validation accuracy
    early_stop_flag = False
    best_model_state = None # To save the best model (in each trial / final training)

    for epoch in range(1, epochs + 1):
        model.train() # Enable training mode
        train_loss = 0.0 # Cumulative (sample-weighted) training loss of the current epoch
        total_train = 0 # Number of training samples seen this epoch
        correct_train = 0 # Number of correctly classified training samples this epoch

        train_preds = [] # Store predicted classes for metrics
        train_targets = []  # Store true labels for metrics

        # Each batch carries the token ids, the attention mask and the ground-truth labels
        for batch in train_loader:
            # Non-blocking H2D copies (works best with pin_memory=True on DataLoader)
            input_ids      = batch["input_ids"].to(device_, non_blocking=True)
            attention_mask = batch["attention_mask"].to(device_, non_blocking=True)
            labels         = batch["labels"].to(device_, non_blocking=True)

            optimizer.zero_grad(set_to_none=True) # Reset gradients

            # AMP forward/backward
            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits  = outputs.logits
                loss    = criterion(logits, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Compute metrics (loss is weighted by batch size so the epoch mean is per-sample)
            bs = labels.size(0)
            train_loss += loss.item() * bs
            total_train += bs

            preds = logits.argmax(dim=1)
            correct_train += (preds == labels).sum().item()
            train_preds.extend(preds.detach().cpu().numpy())
            train_targets.extend(labels.detach().cpu().numpy())

        train_loss /= max(total_train, 1)
        train_accuracy = correct_train / max(total_train, 1)
        train_f1 = f1_score(train_targets, train_preds, average='macro', zero_division=0)
        train_precision = precision_score(train_targets, train_preds, average='macro', zero_division=0)
        train_recall = recall_score(train_targets, train_preds, average='macro', zero_division=0)

        # Validation check
        if val_loader is not None:
            model.eval()
            val_loss_sum = 0.0
            total_val = 0
            correct_val = 0
            val_preds, val_targets = [], []

            with torch.no_grad():
                for batch in val_loader:
                    input_ids      = batch["input_ids"].to(device_, non_blocking=True)
                    attention_mask = batch["attention_mask"].to(device_, non_blocking=True)
                    labels         = batch["labels"].to(device_, non_blocking=True)

                    # AMP also speeds up eval
                    with torch.cuda.amp.autocast(enabled=use_amp):
                        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                        logits  = outputs.logits
                        loss    = criterion(logits, labels)

                    bs = labels.size(0)
                    val_loss_sum += loss.item() * bs
                    total_val += bs

                    preds = logits.argmax(dim=1)
                    correct_val += (preds == labels).sum().item()
                    val_preds.extend(preds.detach().cpu().numpy())
                    val_targets.extend(labels.detach().cpu().numpy())

            val_loss = val_loss_sum / max(total_val, 1)
            val_accuracy = correct_val / max(total_val, 1)
            val_precision = precision_score(val_targets, val_preds, average='macro', zero_division=0)
            val_recall = recall_score(val_targets, val_preds, average='macro', zero_division=0)
            val_f1 = f1_score(val_targets, val_preds, average='macro', zero_division=0)

            if patience is not None:
                # Check for Early stopping (& updates best_val_accuracy & epoch)
                best_val_accuracy, best_val_accuracy_epoch, early_stop_flag = early_stop_check(
                    patience, best_val_accuracy, best_val_accuracy_epoch, val_accuracy, epoch
                )
            elif val_accuracy > best_val_accuracy:
                # No early stopping: the best epoch must still be tracked here, otherwise the
                # function would return 0.0 and every epoch would overwrite the "best" weights.
                best_val_accuracy = val_accuracy
                best_val_accuracy_epoch = epoch

            # Save best-so-far weights (>= to handle ties)
            if val_accuracy >= best_val_accuracy and total_val > 0:
                best_model_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

            # W & B logging (if active)
            if wandb.run is not None:
                wandb.log({
                    "Epoch": epoch,
                    "Train Loss": train_loss,
                    "Train Accuracy": train_accuracy,
                    "Train F1 Score": train_f1,
                    "Train Precision": train_precision,
                    "Train Recall": train_recall,
                    "Validation Loss": val_loss,
                    "Validation Accuracy": val_accuracy,
                    "Validation Precision": val_precision,
                    "Validation Recall": val_recall,
                    "Validation F1": val_f1,
                })

            if early_stop_flag:
                break

    # Restore best weights into the model before returning best_val_accuracy
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return best_val_accuracy

## **HP Tuning using the Objective function (without HuggingFace's Trainer)**

Optuna objective function for tuning the given Transformer encoder model on twitter data.

Each trial runs training with a different set of hyperparameters and logs key training & validation metrics to Weights & Biases.


In [None]:
# Objective Function for Optuna:
def objective(trial, architecture):
    """Run a single Optuna trial and return its best validation accuracy.

    Samples one hyperparameter combination, fine-tunes the requested encoder on the
    pre-tokenized "train_reduced" split, evaluates on the "validation" split and logs
    the run to Weights & Biases.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        architecture: "twitter-roberta-base" selects "cardiffnlp/twitter-roberta-base";
            any other value falls back to "vinai/bertweet-base".

    Returns:
        The best validation accuracy returned by train_model_with_hyperparams
        (the quantity the study maximizes).
    """
    # Pick the HF checkpoint and the folder that holds its pre-tokenized dataset
    # (the folders written by pretokenize_one in the Pre-tokenization section).
    if architecture == "twitter-roberta-base":
        model_name = "cardiffnlp/twitter-roberta-base"
        pretokenized_dir = "data/tokenized_twitter_roberta_base"
    else:
        model_name = "vinai/bertweet-base"
        pretokenized_dir = "data/tokenized_bertweet_base"

    # Sequence-classification model with 5 output labels (the 5 sentiments) + its tokenizer.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    base_model = model.roberta  # both checkpoints expose their encoder as `.roberta`

    # Hyperparameter suggestions.
    # suggest_float(..., log=True) is the replacement for the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)
    patience = trial.suggest_int("patience", 7, 10)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    num_layers_finetune = trial.suggest_int("num_layers_finetune", 0, 3)

    # safety: correct dtypes + torch output
    ds = load_from_disk(pretokenized_dir)  # Arrow-backed HF DatasetDict created in the Pre-tokenization part
    for split in ds:
        ds[split] = ds[split].cast_column("input_ids", Sequence(Value("int64")))
        ds[split] = ds[split].cast_column("attention_mask", Sequence(Value("int64")))  # or "bool"
        ds[split] = ds[split].cast_column("labels", Value("int64"))
        ds[split].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # keep dynamic padding (no tokenization here—collator only pads per batch)
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="pt")

    # build loaders from the pretokenized HF dataset
    train_loader = DataLoader(
        ds["train_reduced"], batch_size=batch_size, shuffle=True,
        collate_fn=collator, num_workers=4, pin_memory=True,
        persistent_workers=True, prefetch_factor=2
    )
    # No gradients are needed for validation, so the batch may be up to 2x larger (capped at 128).
    val_loader = DataLoader(
        ds["validation"], batch_size=min(2*batch_size, 128), shuffle=False,
        collate_fn=collator, num_workers=4, pin_memory=True,
        persistent_workers=True, prefetch_factor=2
    )

    # Freeze the whole encoder, then unfreeze only its last `num_layers_finetune` layers
    # and the classification head.
    for p in base_model.parameters():
        p.requires_grad = False
    if num_layers_finetune > 0:  # safety guard: layer[-0:] would select ALL layers
        for p in base_model.encoder.layer[-num_layers_finetune:].parameters():
            p.requires_grad = True
    for p in model.classifier.parameters():
        p.requires_grad = True

    # Define optimizer and loss function
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Close any run left open (e.g. by an interrupted cell) so this trial gets its own run.
    if wandb.run is not None:
        wandb.finish()

    # Initialize Weights & Biases - the values in the config are the properties of each trial.
    wandb.init(project=f"{architecture}_CORONA_NLP_Twitter_Sentiment_Analysis_13.8.2025_FULL_HP_TUNING",
               entity = "idoshahar96-tel-aviv-university",
               config={
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "patience": patience,
        "batch_size": batch_size,
        "num_layers_finetune": num_layers_finetune,
        "architecture": architecture,
        "dataset": "CORONA-NLP-Train_Twitter-Sentiment-Analysis"},
        name=f"trial_{trial.number}") # The name that will be saved in the W&B platform

    try:
        # Train the model and get the best validation accuracy
        best_val_accuracy = train_model_with_hyperparams(
            model, train_loader, val_loader, optimizer, criterion,
            epochs=15, patience=patience, trial=trial
        )
    finally:
        # Always close the W&B run, even if training raised, so the next trial starts clean.
        wandb.finish()

    return best_val_accuracy # Return best validation accuracy as the objective to maximize

## **Pre-tokenization**

Tokenization is CPU-heavy. If we do it all over again in each of the Optuna trials, then re-tokenizing wastes time.
Pre-tokenization makes that cost zero for subsequent runs.

It includes:
running the tokenizer once over the whole dataset and applying truncation with a fixed ceiling `MAX_LEN`.
For each sample i, we store a variable-length vector of token IDs whose length is len_i = min(original_len_i, MAX_LEN).
Each saved sample can therefore have a different length.
We save the result to disk (Arrow format) with the columns `input_ids`, `attention_mask`, and `labels`.
So, after this step, no trial needs to call the tokenizer; every trial just loads these IDs.

In this step we do not apply any padding at all; padding is deferred to batch-construction time:
The DataLoader pulls a batch from the disk. Then, the collator looks at the lengths in that batch, finds the longest sequence in each batch, and pads only up to this length.
This is dynamic padding: it happens per batch, at runtime, and never re-tokenizes—it only adds pad tokens so tensors in the batch share the same shape
The Dynamic padding keeps tensors tight to the batch’s real lengths → fewer pad tokens → fewer FLOPs in the model’s forward pass.

In [None]:
# Swap the textual "Sentiment" column of every split for an integer "label" column (encoded via label2id).
def _with_integer_label(frame):
    """Return a copy of `frame` with "Sentiment" replaced by its label2id-encoded "label" column."""
    encoded = frame["Sentiment"].map(label2id)
    return frame.assign(label=encoded).drop(columns=["Sentiment"])

train_df_reduced_ = _with_integer_label(train_df_reduced)
val_df_ = _with_integer_label(val_df)
test_df_ = _with_integer_label(test_df)

# SANITY CHECK - to make sure our training works, we added this code to make sure the training works on little training & validation data (as well as few trials & epochs per trial).
# train_df_reduced_ = train_df_reduced_.sample(n=300, random_state=42)  # pick only 300 training rows
# val_df_ = val_df_.sample(n=100, random_state=42)                      # pick only 100 validation rows
# test_df_ = test_df_.sample(n=100, random_state=42)                    # optional: smaller test set too

# Wrap the three pandas frames as HuggingFace Datasets inside one DatasetDict (pandas index is not kept).
raw_ds = DatasetDict({
    split_name: Dataset.from_pandas(split_frame, preserve_index=False)
    for split_name, split_frame in (
        ("train_reduced", train_df_reduced_),
        ("validation", val_df_),
        ("test", test_df_),
    )
})


def pretokenize_one(model_name: str, save_dir: str):
    """Tokenize every split of `raw_ds` once with `model_name`'s tokenizer and save it to `save_dir`.

    The truncation cap is the 95th percentile of the untruncated token counts of the
    training tweets (`train_df_reduced["CleanTweet"]`), clamped to the range [64, 128].
    No padding is applied here, so stored sequences keep their own lengths.

    Args:
        model_name: HuggingFace checkpoint whose fast tokenizer is used.
        save_dir: Folder the tokenized DatasetDict is written to.
    """
    hf_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    # One length cap per tokenizer, derived from the untruncated training tweets.
    untruncated = hf_tokenizer(train_df_reduced["CleanTweet"].tolist(), truncation=False)
    token_counts = list(map(len, untruncated["input_ids"]))
    p95_length = int(np.percentile(token_counts, 95))
    max_len = max(64, min(p95_length, 128))
    print(f"[{model_name}] MAX_LEN={max_len}")

    def _encode_batch(batch):
        # Truncate to the cap but do NOT pad; padding is left to the batch collator.
        return hf_tokenizer(batch["CleanTweet"], truncation=True, max_length=max_len, padding=False)

    # Tokenize all splits, dropping the raw text and exposing the target as "labels".
    tokenized = raw_ds.map(_encode_batch, batched=True, remove_columns=["CleanTweet"])
    tokenized = tokenized.rename_column("label", "labels")
    tokenized.save_to_disk(save_dir)
    print(f"Saved to: {save_dir}")

# Pre-tokenize once for each architecture used below: (HF checkpoint, output folder).
for _checkpoint, _output_dir in (
    ("cardiffnlp/twitter-roberta-base", "data/tokenized_twitter_roberta_base"),
    ("vinai/bertweet-base", "data/tokenized_bertweet_base"),
):
    pretokenize_one(_checkpoint, _output_dir)

### **Running the Models**

#### ***Model (1) - RoBERTa-Base-Tweet***

In [None]:
# Optuna study for Model (1) - RoBERTa-Base-Tweet (rec4).
def _objective_roberta_rec4(trial):
    """Bind the architecture so Optuna can call the objective with the trial alone."""
    return objective(trial, "twitter-roberta-base")

# direction="maximize": the objective returns validation accuracy, which we want as high as possible.
study = optuna.create_study(direction="maximize")
study.optimize(_objective_roberta_rec4, n_trials=10)  # 10 trials

In [None]:
# Documenting best hyperparameter combination - first model - RoBERTa-Base-Tweet - Rec4 code:
# Keep a model-specific handle on the study so it stays reachable if `study` is reassigned.
study_roberta_base_tweet_rec4 = study

# Both names reference the same study object, so the summary is reported once.
print("Best objective value (validation accuracy):", study_roberta_base_tweet_rec4.best_value)
print("The chosen HP combination:", study_roberta_base_tweet_rec4.best_params)
print("Trial number of the best objective (validation accuracy) value:", study_roberta_base_tweet_rec4.best_trial.number)

In [None]:
# Define the path to save the file in Google Drive with REC4 naming
# basic_drive_path = "/content/drive/MyDrive" # USER CAN CHANGE IT IF HE DOESN'T WORK IN DRIVE AND DOWNLOADS FROM DRIVE THE Project_COVID_NLP folder!! (under # but the hashtag sign # can be removed if needed)
project_root = f"{basic_drive_path}/Project_COVID_NLP" # Root project folder
hp_root = f"{project_root}/Model_HPs"
drive_path = f"{hp_root}/best_roberta_base_tweet_rec4_hyperparams.json"

# Create the hyperparameter folder if it is missing; otherwise open(..., "w") raises FileNotFoundError.
os.makedirs(hp_root, exist_ok=True)

with open(drive_path, "w") as f:
    # json.dump(study_roberta_base_tweet_rec4.best_params, f)  # use this line to dump the live study result instead
    json.dump({'learning_rate': 0.0003834791389042033, 'weight_decay': 2.88286253103848e-06, 'patience': 7, 'batch_size': 128, 'num_layers_finetune': 3}, f) # Manually typed the best_params for future use

print(f"\nBest hyperparameters saved to {drive_path}")

#### ***Model (2) - BERTweet-Base***

In [None]:
# HuggingFace checkpoint used for Model (2) - BERTweet-Base
model_name = "vinai/bertweet-base"

# Tokenizer that belongs to the checkpoint
tokenizer_bertweet_base = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification model built on the checkpoint, with 5 output labels (one per sentiment),
# moved to the selected device.
bertweet_base_2_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
bertweet_base_2_model = bertweet_base_2_model.to(device)
bertweet_base_2_model # glancing at the model architecture

In [None]:
# Optuna study for Model (2) - BERTweet-Base (rec4).
def _objective_bertweet_rec4(trial):
    """Bind the architecture so Optuna can call the objective with the trial alone."""
    return objective(trial, "bertweet-base")

# direction="maximize": the objective returns validation accuracy, which we want as high as possible.
study_bertweet_base_rec4 = optuna.create_study(direction="maximize")
study_bertweet_base_rec4.optimize(_objective_bertweet_rec4, n_trials=10)  # 10 trials

In [None]:
# Documenting best hyperparameter combination - Second Model - BERTweet-Base - Rec4 code:

# print("Best objective value (validation accuracy):", study_bertweet_base_rec4.best_value)
# print("The chosen HP combination:", study_bertweet_base_rec4.best_params)
# print("Trial number of the best objective (validation accuracy) value:", study_bertweet_base_rec4.best_trial.number)

# Define the path to save the file in Google Drive with REC4 naming
# basic_drive_path = "/content/drive/MyDrive" # USER CAN CHANGE IT IF HE DOESN'T WORK IN DRIVE AND DOWNLOADS FROM DRIVE THE Project_COVID_NLP folder!! (under # but the hashtag sign # can be removed if needed)
project_root = f"{basic_drive_path}/Project_COVID_NLP" # Root project folder
hp_root = f"{project_root}/Model_HPs"
drive_path = f"{hp_root}/best_bertweet_base_rec4_hyperparams.json"

# Create the hyperparameter folder if it is missing; otherwise open(..., "w") raises FileNotFoundError.
os.makedirs(hp_root, exist_ok=True)

with open(drive_path, "w") as f:
    # json.dump(study_bertweet_base_rec4.best_params, f)  # use this line to dump the live study result instead
    json.dump({'learning_rate': 0.0001184412471705182, 'weight_decay': 1.2699696348040995e-05, 'patience': 10, 'batch_size': 128, 'num_layers_finetune': 3}, f) # Manually typed the best_params for future use

print(f"\nBest hyperparameters saved to {drive_path}")

## **Final Training WITHOUT using HuggingFace functions (Trainer)**

After finding the best trial (hyperparameter combination) using the objective function, the `FINAL_train_model_with_hyperparams` is called for final model training using the obtained hyperparameter combination. It appears similar to the way we trained each model under each trial specification in the Optuna based objective function. This additional function supports model saving too, and generalized for each model architecture. It's worth noting that in practice, the validation dataset in this function would be the actual test set.

In [None]:
# basic_drive_path = "/content/drive/MyDrive" # USER CAN CHANGE IT IF HE DOESN'T WORK IN DRIVE AND DOWNLOADS FROM DRIVE THE Project_COVID_NLP folder!! (under # but the hashtag sign # can be removed if needed)
# Root project folder, located directly under basic_drive_path
project_root = "{}/Project_COVID_NLP".format(basic_drive_path)

# Folder inside the project that holds all trained model weights
model_root = "{}/Model_Weights".format(project_root)

In [None]:
def FINAL_train_model_with_hyperparams(architecture, best_params, save_path):
    """Train the final model with a fixed hyperparameter set and save it to `save_path`.

    The model is trained on "train_reduced" + "validation" merged together, and the
    "test" split is passed to train_model_with_hyperparams as its validation loader.

    NOTE(review): because the test split plays the validation role here, early stopping
    and the best-weights restore inside train_model_with_hyperparams are driven by test
    accuracy, so the reported number is not a fully held-out estimate.

    Args:
        architecture: "twitter-roberta-base" selects "cardiffnlp/twitter-roberta-base";
            any other value falls back to "vinai/bertweet-base".
        best_params: dict with the keys "learning_rate", "weight_decay", "patience",
            "batch_size" and "num_layers_finetune".
        save_path: Folder the trained model and its tokenizer are written to.

    Returns:
        The best accuracy returned by train_model_with_hyperparams (measured on the
        "test" split, see the note above).
    """
    # Pick the HF checkpoint and the folder that holds its pre-tokenized dataset
    # (the folders written by pretokenize_one in the Pre-tokenization section).
    if architecture == "twitter-roberta-base":
        model_name = "cardiffnlp/twitter-roberta-base"
        pretokenized_dir = "data/tokenized_twitter_roberta_base"
    else:
        model_name = "vinai/bertweet-base"
        pretokenized_dir = "data/tokenized_bertweet_base"

    # Sequence-classification model with 5 output labels (the 5 sentiments) + its tokenizer.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    base_model = model.roberta  # both checkpoints expose their encoder as `.roberta`

    # safety: correct dtypes + torch output
    ds = load_from_disk(pretokenized_dir)  # Arrow-backed HF DatasetDict created in the Pre-tokenization part
    for split in ds:
        ds[split] = ds[split].cast_column("input_ids", Sequence(Value("int64")))
        ds[split] = ds[split].cast_column("attention_mask", Sequence(Value("int64")))  # or "bool"
        ds[split] = ds[split].cast_column("labels", Value("int64"))
        ds[split].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # keep dynamic padding (no tokenization here—collator only pads per batch)
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="pt")

    # Merge train + validation for final training
    full_train_dataset = concatenate_datasets([ds["train_reduced"], ds["validation"]])
    full_train_dataset = full_train_dataset.shuffle(seed=42) # Shuffle the model's training data to add randomness

    # build loaders from the pretokenized HF dataset
    train_loader = DataLoader(
        full_train_dataset, batch_size=best_params["batch_size"], shuffle=True,
        collate_fn=collator, num_workers=4, pin_memory=True,
        persistent_workers=True, prefetch_factor=2
    )
    # The "test" split is used as the evaluation set of the final run.
    val_loader = DataLoader(
        ds["test"], batch_size=min(2*best_params["batch_size"], 128), shuffle=False,
        collate_fn=collator, num_workers=4, pin_memory=True,
        persistent_workers=True, prefetch_factor=2
    )

    # Freeze the whole encoder, then unfreeze only its last `num_layers_finetune` layers
    # and the classification head.
    for p in base_model.parameters():
        p.requires_grad = False
    if best_params["num_layers_finetune"] > 0:  # safety guard: layer[-0:] would select ALL layers
        for p in base_model.encoder.layer[-best_params["num_layers_finetune"]:].parameters():
            p.requires_grad = True
    for p in model.classifier.parameters():
        p.requires_grad = True

    # Define optimizer and loss function
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=best_params["learning_rate"], weight_decay=best_params["weight_decay"])

    # If a W&B run is already active, close it before starting the final-training run.
    if wandb.run is not None:
        wandb.finish()

    # Initialize Weights & Biases - the values in the config are the chosen hyperparameters.
    wandb.init(project=f"{architecture}_CORONA_NLP_Twitter_Sentiment_Analysis_19.8.2025_FULL_TRAINING",
               entity = "idoshahar96-tel-aviv-university",
               config={
        "learning_rate": best_params["learning_rate"],
        "weight_decay": best_params["weight_decay"],
        "patience": best_params["patience"],
        "batch_size": best_params["batch_size"],
        "num_layers_finetune": best_params["num_layers_finetune"],
        "architecture": architecture,
        "dataset": "CORONA-NLP-Train_Twitter-Sentiment-Analysis"},
        name="FINAL_TRAINING", # The name that will be saved in the W&B platform
        reinit=True)

    try:
        # Train the model; train_model_with_hyperparams restores the best weights into `model`.
        best_val_accuracy = train_model_with_hyperparams(
            model, train_loader, val_loader, optimizer, criterion,
            epochs=25, patience=best_params["patience"], trial=None
        )
    finally:
        # Always close the W&B run, even if training raised.
        wandb.finish()

    # Save model and tokenizer side by side so the folder can be reloaded with from_pretrained
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    # Previously computed but discarded; returning it is harmless for callers that ignore it.
    return best_val_accuracy

In [None]:
# best_params = study_roberta_base_tweet_rec4.best_params  # get best HPs from the model's Optuna study (are under # but the hashtag sign # can be removed if needed, for the sake of manual inscription of best param, check the row below)
# Manually typed the best_params for future use
best_params = {
    'learning_rate': 0.0003834791389042033,
    'weight_decay': 2.88286253103848e-06,
    'patience': 7,
    'batch_size': 128,
    'num_layers_finetune': 3,
}
name_path = "/best_model_roberta_base_tweet_rec4"
save_path = f"{model_root}{name_path}" # folder that will receive the model's weights

# Final training of Model (1) - RoBERTa-Base-Tweet - with the Optuna study's best trial HPs:
FINAL_train_model_with_hyperparams("twitter-roberta-base", best_params, save_path)

# Pack the whole model folder into <save_path>.zip
shutil.make_archive(base_name=save_path, format="zip", root_dir=save_path)

# Download the zip to your computer
files.download(f"{save_path}.zip")

In [None]:
# best_params = study_bertweet_base_rec4.best_params  # get best HPs from the model's Optuna study (are under # but the hashtag sign # can be removed if needed, for the sake of manual inscription of best param, check the row below)
# Manually typed the best_params for future use
best_params = {
    'learning_rate': 0.0001184412471705182,
    'weight_decay': 1.2699696348040995e-05,
    'patience': 10,
    'batch_size': 128,
    'num_layers_finetune': 3,
}
name_path = "/best_model_bertweet_base_rec4"
save_path = f"{model_root}{name_path}" # folder that will receive the model's weights

# Final training of Model (2) - BERTweet-Base - with the Optuna study's best trial HPs:
FINAL_train_model_with_hyperparams("bertweet-base", best_params, save_path)

# Pack the whole model folder into <save_path>.zip
shutil.make_archive(base_name=save_path, format="zip", root_dir=save_path)

# Download the zip to your computer
files.download(f"{save_path}.zip")

## **HP Tuning using HuggingFace functions (Trainer)**

In [None]:
# Load the four evaluation metrics once, via the evaluate library (same order as before)
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

# Compute metrics function for the Trainer
def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into accuracy and macro-averaged precision/recall/F1.

    Args:
        eval_pred: Pair that unpacks into `logits` and `labels`; the predicted class
            is the argmax of `logits` over the last axis.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    scores = {
        "accuracy": accuracy_metric.compute(predictions=preds, references=labels)["accuracy"],
    }
    # The remaining metrics share the same call shape and are macro-averaged over the classes.
    for key, metric in (
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    ):
        scores[key] = metric.compute(predictions=preds, references=labels, average="macro")[key]
    return scores

In [None]:
# Objective function for Optuna hyperparameter tuning
def objective_HF(trial, architecture):
    """Run a single Optuna trial with the HuggingFace Trainer and return validation accuracy.

    Samples one hyperparameter combination, fine-tunes the requested encoder on the
    pre-tokenized "train_reduced" split with early stopping, saves the resulting model
    under "HF-results/trial_<n>" and evaluates it on the "validation" split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        architecture: "twitter-roberta-base" selects "cardiffnlp/twitter-roberta-base";
            any other value falls back to "vinai/bertweet-base".

    Returns:
        The "eval_accuracy" reported by trainer.evaluate() (0.0 if the key is missing).

    Raises:
        optuna.exceptions.TrialPruned: if the evaluated accuracy is NaN.
    """
    # Pick the HF checkpoint and the folder that holds its pre-tokenized dataset
    # (the folders written by pretokenize_one in the Pre-tokenization section).
    if architecture == "twitter-roberta-base":
        model_name = "cardiffnlp/twitter-roberta-base"
        pretokenized_dir = "data/tokenized_twitter_roberta_base"
    else:
        model_name = "vinai/bertweet-base"
        pretokenized_dir = "data/tokenized_bertweet_base"

    # Load model and tokenizer
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5) # 5 labels for the 5 sentiments
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    base_model = model.roberta # Base model for both models (RoBERTa-Base-Tweet & BERTweet-Base) - RoBERTa

    # Hyperparameter search space.
    # suggest_float(..., log=True) is the replacement for the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)
    patience = trial.suggest_int("patience", 7, 10)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    num_layers_finetune = trial.suggest_int("num_layers_finetune", 0, 3)
    lr_scheduler_type = trial.suggest_categorical("lr_scheduler_type", ["linear", "cosine", "polynomial"])

    # safety: correct dtypes + torch output
    ds = load_from_disk(pretokenized_dir)  # Arrow-backed HF DatasetDict created in the Pre-tokenization part
    for split in ds:
        ds[split] = ds[split].cast_column("input_ids", Sequence(Value("int64")))
        ds[split] = ds[split].cast_column("attention_mask", Sequence(Value("int64")))  # or "bool"
        ds[split] = ds[split].cast_column("labels", Value("int64"))
        ds[split].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # keep dynamic padding (no tokenization here—collator only pads per batch)
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="pt")

    # Freeze the whole encoder, then unfreeze only its last `num_layers_finetune` layers
    # and the classification head.
    for p in base_model.parameters():
        p.requires_grad = False
    if num_layers_finetune > 0:  # safety guard: layer[-0:] would select ALL layers
        for p in base_model.encoder.layer[-num_layers_finetune:].parameters():
            p.requires_grad = True
    for p in model.classifier.parameters():
        p.requires_grad = True

    # If a W&B run is already active, close it so this trial gets its own run.
    if wandb.run is not None:
        wandb.finish()

    # Initialize Weights & Biases - the values in the config are the properties of each trial.
    wandb.init(
        project=f"{architecture}_HF_CORONA_NLP_Twitter_Sentiment_Analysis_14.8.2025_FULL_HP_TUNING",
        entity="idoshahar96-tel-aviv-university",
        config={
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "patience": patience,
            "batch_size": batch_size,
            "num_layers_finetune": num_layers_finetune,
            "lr_scheduler_type": lr_scheduler_type,
            "architecture": architecture,
            "dataset": "CORONA-NLP-Train_Twitter-Sentiment-Analysis"
        },
        name=f"trial_{trial.number}",
        reinit=True
    )

    trial_dir = f"HF-results/trial_{trial.number}"  # checkpoints + saved model of this trial

    try:
        # TrainingArguments for the Hugging Face Trainer
        training_args = TrainingArguments(
            output_dir=trial_dir,         # where checkpoints will be saved
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            num_train_epochs=20,          # Setting the number of epochs for training - 20
            eval_strategy="epoch",        # evaluate at the end of each epoch
            save_strategy="epoch",        # save a checkpoint at the end of each epoch
            logging_strategy="epoch",     # log metrics at the end of each epoch
            load_best_model_at_end=True,  # reload the best checkpoint (based on metric_for_best_model)
            metric_for_best_model="accuracy", # optimize w.r.t accuracy
            greater_is_better=True,
            save_total_limit=1,           # limit how many checkpoints are kept on disk
            report_to="wandb",            # log to Weights & Biases
            lr_scheduler_type=lr_scheduler_type
        )

        # Create Trainer instance
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=ds["train_reduced"],
            eval_dataset=ds["validation"],
            tokenizer=tokenizer,
            data_collator=collator,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)]
        )

        # Train the model
        trainer.train()

        # Save this trial's model and tokenizer
        trainer.save_model(trial_dir)  # ensures config.json + weights are there
        tokenizer.save_pretrained(trial_dir)

        # Evaluate the best model on the validation set
        eval_metrics = trainer.evaluate()
    finally:
        # Always close the W&B run, even if training/evaluation raised, so the next trial starts clean.
        wandb.finish()

    # Optuna uses the validation accuracy as the optimization target
    acc = eval_metrics.get("eval_accuracy", 0.0)
    if np.isnan(acc):
        raise optuna.exceptions.TrialPruned()

    return acc

#### **RoBERTa-Base-Tweet:**

In [None]:
# Creating an Optuna Study - RoBERTa -Base-Tweet (rec5):
study_roberta_base_tweet_rec5 = optuna.create_study(direction="maximize")  # Specifies that the goal of the optimization is to maximize the objective function - accuracy in our case.
study_roberta_base_tweet_rec5.optimize(lambda trial: objective_HF(trial, "twitter-roberta-base"), n_trials=12) # Specified 12 trials

print("Best objective value (validation accuracy):", study_roberta_base_tweet_rec5.best_value)
print("The chosen HP combination:", study_roberta_base_tweet_rec5.best_params)
print("Trial number of the best objective (validation accuracy) value:", study_roberta_base_tweet_rec5.best_trial.number)

# Define the path to save the file in Google Drive with REC5 naming
# basic_drive_path = "/content/drive/MyDrive" # USER CAN CHANGE IT IF HE DOESN'T WORK IN DRIVE AND DOWNLOADS FROM DRIVE THE Project_COVID_NLP folder!! (under # but the hashtag sign # can be removed if needed)
project_root = f"{basic_drive_path}/Project_COVID_NLP" # Root project folder
hp_root = f"{project_root}/Model_HPs"
drive_path = f"{hp_root}/best_model_roberta_base_tweet_rec5_hyperparams.json"

# Create the hyperparameter folder if it is missing, so the result of the (expensive) study
# above is not lost to a FileNotFoundError when writing the JSON file.
os.makedirs(hp_root, exist_ok=True)

with open(drive_path, "w") as f:
    json.dump(study_roberta_base_tweet_rec5.best_params, f)

print(f"\nBest hyperparameters saved to {drive_path}")

#### **BerTweet-Base:**

In [None]:
# Creating an Optuna Study - BerTweet-Base (rec5):
study_bertweet_base_rec5 = optuna.create_study(direction="maximize")  # Specifies that the goal of the optimization is to maximize the objective function - accuracy in our case.
study_bertweet_base_rec5.optimize(lambda trial: objective_HF(trial, "bertweet-base"), n_trials=12) # Specified 12 trials

print("Best objective value (validation accuracy):", study_bertweet_base_rec5.best_value)
print("The chosen HP combination:", study_bertweet_base_rec5.best_params)
print("Trial number of the best objective (validation accuracy) value:", study_bertweet_base_rec5.best_trial.number)

# Define the path to save the file in Google Drive with REC5 naming
# basic_drive_path = "/content/drive/MyDrive" # USER CAN CHANGE IT IF HE DOESN'T WORK IN DRIVE AND DOWNLOADS FROM DRIVE THE Project_COVID_NLP folder!! (under # but the hashtag sign # can be removed if needed)
project_root = f"{basic_drive_path}/Project_COVID_NLP" # Root project folder
hp_root = f"{project_root}/Model_HPs"
drive_path = f"{hp_root}/best_model_bertweet_base_rec5_hyperparams.json"

# Create the hyperparameter folder if it is missing, so the result of the (expensive) study
# above is not lost to a FileNotFoundError when writing the JSON file.
os.makedirs(hp_root, exist_ok=True)

with open(drive_path, "w") as f:
    json.dump(study_bertweet_base_rec5.best_params, f)

print(f"\nBest hyperparameters saved to {drive_path}")

## **Final Training using HuggingFace (HF) functions**

After finding the best trial (hyperparameter combination) using the objective-HF function, the `train_model_with_hyperparams_HF` is called for final model training using the obtained hyperparameter combination. It appears similar to the way we trained each model under each trial specification in the Optuna based objective-HF function. This additional function supports model saving too, and generalized for each model architecture. It's worth noting that in practice, the validation dataset in this function would be the actual test set.

In [None]:
def train_model_with_hyperparams_HF(architecture, best_params, save_path):
    """Run the final fine-tuning of one architecture with fixed hyperparameters and save the result.

    The model is trained on the merged `train_reduced` + `validation` splits of the pre-tokenized
    dataset and evaluated on the `test` split at the end of every epoch. Metrics are logged to
    Weights & Biases; the W&B run is always closed, even if training or saving fails.

    NOTE(review): because the `test` split is the Trainer's eval set, early stopping and
    `load_best_model_at_end` select the checkpoint by test accuracy, so the reported test metrics
    are optimistic. This is kept as in the original design.

    Args:
        architecture: "twitter-roberta-base" selects cardiffnlp/twitter-roberta-base; any other
            value selects vinai/bertweet-base.
        best_params: dict with the keys "learning_rate", "weight_decay", "patience",
            "batch_size", "num_layers_finetune" and "lr_scheduler_type".
        save_path: directory used for the Trainer checkpoints and for the final model + tokenizer.

    Returns:
        None. The trained model and its tokenizer are written to `save_path`.
    """

    # Pick the HF checkpoint and the folder holding the matching pre-tokenized dataset:
    if architecture == "twitter-roberta-base":
        model_name = "cardiffnlp/twitter-roberta-base"
        pretokenized_dir = "data/tokenized_twitter_roberta_base"
    else:
        model_name = "vinai/bertweet-base"
        pretokenized_dir = "data/tokenized_bertweet_base"

    # Load model and tokenizer
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5) # 5 labels for the 5 sentiments
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    base_model = model.roberta # Both checkpoints expose their encoder under `.roberta`

    # safety: correct dtypes + torch output
    ds = load_from_disk(pretokenized_dir) # Arrow-backed HF DatasetDict written by the notebook's pre-tokenization step
    for split in ds:
        ds[split] = ds[split].cast_column("input_ids", Sequence(Value("int64")))
        ds[split] = ds[split].cast_column("attention_mask", Sequence(Value("int64")))  # or "bool"
        ds[split] = ds[split].cast_column("labels", Value("int64"))
        ds[split].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # Merge train + validation for final training
    full_train_dataset = concatenate_datasets([ds["train_reduced"], ds["validation"]])
    full_train_dataset = full_train_dataset.shuffle(seed=42) # Fixed seed -> reproducible order

    # keep dynamic padding (no tokenization here - the collator only pads per batch)
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="pt")

    # Freeze the whole encoder, then unfreeze its last `num_layers_finetune` layers and the classifier head
    for p in base_model.parameters():
        p.requires_grad = False
    if best_params["num_layers_finetune"] > 0:  # safety guard: layer[-0:] would select ALL layers
        for p in base_model.encoder.layer[-best_params["num_layers_finetune"]:].parameters():
            p.requires_grad = True
    for p in model.classifier.parameters():
        p.requires_grad = True

    # Close any W&B run that is still open so this training gets its own run.
    if wandb.run is not None:
        wandb.finish()

    # Initialize Weights & Biases - the config holds the properties of the best trial found in the Optuna HP-tuning step.
    wandb.init(
        project=f"{architecture}_HF_CORONA_NLP_Twitter_Sentiment_Analysis_19.8.2025_FULL_TRAINING",
        entity="idoshahar96-tel-aviv-university",
        config={
            "learning_rate": best_params["learning_rate"],
            "weight_decay": best_params["weight_decay"],
            "patience": best_params["patience"],
            "batch_size": best_params["batch_size"],
            "num_layers_finetune": best_params["num_layers_finetune"],
            "lr_scheduler_type": best_params["lr_scheduler_type"],
            "architecture": architecture,
            "dataset": "CORONA-NLP-Train_Twitter-Sentiment-Analysis"
        },
        name="FINAL_TRAINING",
        reinit=True
    )

    try:
        # TrainingArguments for the Hugging Face Trainer
        training_args = TrainingArguments(
            output_dir=save_path,  # where checkpoints will be saved
            per_device_train_batch_size=best_params["batch_size"],
            per_device_eval_batch_size=best_params["batch_size"],
            learning_rate=best_params["learning_rate"],
            weight_decay=best_params["weight_decay"],
            num_train_epochs=25,           # upper bound; early stopping may end training sooner
            eval_strategy="epoch",        # evaluate at the end of each epoch
            save_strategy="epoch",        # save a checkpoint at the end of each epoch
            logging_strategy="epoch",     # log metrics at the end of each epoch
            load_best_model_at_end=True,  # reload the best checkpoint (based on metric_for_best_model)
            metric_for_best_model="accuracy", # optimize w.r.t accuracy
            greater_is_better=True,
            save_total_limit=1,           # limit stored checkpoints (per the HF docs the best one is still retained with load_best_model_at_end)
            report_to="wandb",            # log to Weights & Biases
            lr_scheduler_type=best_params["lr_scheduler_type"]
        )

        # Create Trainer instance
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=full_train_dataset,
            eval_dataset=ds["test"], # Evaluating the model using the test dataset
            tokenizer=tokenizer,
            data_collator=collator,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=best_params["patience"])]
        )

        # Train the model
        trainer.train()

        # Save model
        trainer.save_model(save_path)
        tokenizer.save_pretrained(save_path)
    finally:
        # Always close the W&B run, so a failed training does not leave a dangling run behind.
        wandb.finish()

#### **RoBERTa-Base-Tweet:**

In [None]:
# Hyperparameters of the best Optuna trial for RoBERTa-Base-Tweet (rec5), typed in by hand so that
# this cell can run without repeating the search.
# Alternative: best_params = study_roberta_base_tweet_rec5.best_params
best_params = {
    'learning_rate': 0.0000860370374400373,
    'weight_decay': 0.00008459884214639005,
    'patience': 10,
    'batch_size': 128,
    'num_layers_finetune': 3,
    'lr_scheduler_type': 'polynomial',
}
name_path = "/best_model_roberta_base_tweet_rec5"
save_path = f"{model_root}{name_path}" # folder that receives the trained weights

# Final training of model (1) - RoBERTa-Base-Tweet - with the best-trial hyperparameters:
train_model_with_hyperparams_HF("twitter-roberta-base", best_params, save_path)

# Pack the whole model folder into <save_path>.zip ...
shutil.make_archive(base_name=save_path, format="zip", root_dir=save_path)

# ... and download the archive to the local machine
files.download(save_path + ".zip")

#### **BerTweet-Base:**

In [None]:
# Hyperparameters of the best Optuna trial for BERTweet-Base (rec5), typed in by hand so that
# this cell can run without repeating the search.
# Alternative: best_params = study_bertweet_base_rec5.best_params
best_params = {
    'learning_rate': 7.668855564109297e-05,
    'weight_decay': 4.8978169582912055e-06,
    'patience': 9,
    'batch_size': 64,
    'num_layers_finetune': 3,
    'lr_scheduler_type': 'linear',
}
name_path = "/best_model_bertweet_base_rec5"
save_path = f"{model_root}{name_path}" # folder that receives the trained weights

# Final training of model (2) - BERTweet-Base - with the best-trial hyperparameters:
train_model_with_hyperparams_HF("bertweet-base", best_params, save_path)

# Pack the whole model folder into <save_path>.zip ...
shutil.make_archive(base_name=save_path, format="zip", root_dir=save_path)

# ... and download the archive to the local machine
files.download(save_path + ".zip")

# **Compression Techniques**

## **Technique (1) - Quantization**

As a model compression technique, **Quantization reduces model size and speeds up inference by converting weights to a lower-precision representation**. The function below applies **dynamic (post-training) quantization to a fine-tuned HF model** (one that has gone through the final training phase above), evaluates it on the training and test sets, and saves the quantized version.

In [None]:
# Folder layout used by the quantization cells below. Every path is derived from basic_drive_path,
# which this cell does not define - it must already exist from an earlier cell.
# basic_drive_path = "/content/drive/MyDrive" # Uncomment and edit if the Project_COVID_NLP folder was downloaded somewhere other than this Drive location.
project_root = f"{basic_drive_path}/Project_COVID_NLP" # Root project folder
model_root   = f"{project_root}/Model_Weights" # Fine-tuned model weights (read by quantize_evaluate_and_compare)

# Define quant_root inside the project, for all quantized weights
quant_root = f"{project_root}/Quantized_Model_Weights"

In [None]:
# Helper function which evaluates model performance given a specific dataset (loader) - train / test:
def evaluate_model(model, loader, device):
    """Run `model` over every batch of `loader` and return its classification metrics.

    Args:
        model: callable module; it is switched to eval mode and called with `input_ids` and
            `attention_mask`, and must return an object exposing `.logits`.
        loader: iterable of batches, each a mapping with "input_ids", "attention_mask" and "labels".
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall"; the last three are
        macro-averaged with zero_division=0.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in loader:
            # Move the three tensors of the batch onto the evaluation device.
            ids, mask, gold = (
                batch[key].to(device) for key in ("input_ids", "attention_mask", "labels")
            )

            # Predicted class = index of the largest logit.
            logits = model(input_ids=ids, attention_mask=mask).logits
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
            expected.extend(gold.cpu().numpy())

    # Shared settings of the macro-averaged metrics
    macro = dict(average="macro", zero_division=0)
    return {
        "accuracy": accuracy_score(expected, predicted),
        "f1": f1_score(expected, predicted, **macro),
        "precision": precision_score(expected, predicted, **macro),
        "recall": recall_score(expected, predicted, **macro),
    }

In [None]:
# This function quantizes a fine-tuned HF model, evaluates it on training & test sets, compares its performance with the previous model's, and saves the quantized version.
def quantize_evaluate_and_compare(model_name, model_name_dir, best_params):
    """Dynamically quantize a fine-tuned model and compare it with the original on train & test data.

    The fine-tuned model is loaded from `{model_root}/{model_name_dir}`, evaluated, quantized
    (int8 dynamic quantization of all `torch.nn.Linear` layers, on CPU), saved under
    `{quant_root}/{model_name_dir}_quantized` and evaluated again on CPU.

    The saved folder holds the quantized state dict (`pytorch_model.bin`), the model config and
    the tokenizer. To reload: rebuild the architecture from the config, apply the same
    `quantize_dynamic` call, then `load_state_dict` - the state dict is NOT loadable with a plain
    `from_pretrained`.

    NOTE(review): `quantized_params` counts `quantized_model.parameters()`. Dynamically quantized
    Linear layers keep their weights as packed int8 data rather than `nn.Parameter`s, so they
    presumably drop out of this count; read `param_ratio` as "share of parameters left in
    floating point", not as a reduction in the number of weights.

    Args:
        model_name: display name, used as the index of the returned row.
        model_name_dir: folder name of the fine-tuned model; a name containing "roberta"
            (case-insensitive) selects the Twitter-RoBERTa pre-tokenized dataset, anything else
            the BERTweet one.
        best_params: dict providing "batch_size" for the data loaders.

    Returns:
        One-row pandas DataFrame with the parameter counts and, for accuracy / F1 / precision /
        recall, the original, quantized and drop (original - quantized) values on train and test.
    """

    # Define original model path (trained weights) and quantized save path
    model_path     = f"{model_root}/{model_name_dir}"
    quantized_path = f"{quant_root}/{model_name_dir}_quantized"

    # Select the pre-tokenized dataset that matches the model's tokenizer
    if "roberta" in model_name_dir.lower():
        pretokenized_dir = "data/tokenized_twitter_roberta_base"
    else:
        pretokenized_dir = "data/tokenized_bertweet_base"

    # Load model & tokenizer - evaluated on `device` first, moved to CPU before quantization
    model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # safety: correct dtypes + torch output
    ds = load_from_disk(pretokenized_dir) # Arrow-backed HF DatasetDict written by the notebook's pre-tokenization step
    for split in ds:
        ds[split] = ds[split].cast_column("input_ids", Sequence(Value("int64")))
        ds[split] = ds[split].cast_column("attention_mask", Sequence(Value("int64")))  # or "bool"
        ds[split] = ds[split].cast_column("labels", Value("int64"))
        ds[split].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # keep dynamic padding (no tokenization here - the collator only pads per batch)
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="pt")

    # Same train + validation merge that was used for the final training
    full_train_dataset = concatenate_datasets([ds["train_reduced"], ds["validation"]])
    full_train_dataset = full_train_dataset.shuffle(seed=42)

    # Evaluation-only loaders: no shuffling, so the original and the quantized model are scored
    # on exactly the same batches and the comparison is reproducible.
    train_loader = DataLoader(
        full_train_dataset, batch_size=best_params["batch_size"], shuffle=False,
        collate_fn=collator, num_workers=4, pin_memory=True,
        persistent_workers=True, prefetch_factor=2
    )
    test_loader = DataLoader(
        ds["test"], batch_size=min(2*best_params["batch_size"], 128), shuffle=False,
        collate_fn=collator, num_workers=4, pin_memory=True,
    )

    # Evaluate the performance of original model - first:
    train_original = evaluate_model(model, train_loader, device)
    test_original = evaluate_model(model, test_loader, device)

    # move original model to CPU for quantization post-evaluation
    model = model.to("cpu")

    # Apply dynamic quantization (returns a quantized copy; `model` itself stays in full precision)
    quantized_model = torch.quantization.quantize_dynamic(
        model, {torch.nn.Linear}, dtype=torch.qint8
    ).to("cpu")

    # Save quantized model's weights, config & tokenizer
    os.makedirs(quantized_path, exist_ok=True) # Create directory if it doesn't exist
    torch.save(quantized_model.state_dict(), os.path.join(quantized_path, "pytorch_model.bin"))
    model.config.save_pretrained(quantized_path) # config.json is needed to rebuild the architecture on reload
    tokenizer.save_pretrained(quantized_path)

    # Evaluate the performance of quantized model - second (on CPU):
    cpu = torch.device("cpu")
    train_quantized = evaluate_model(quantized_model, train_loader, cpu)
    test_quantized = evaluate_model(quantized_model, test_loader, cpu)

    # Count number of parameters in both models - original & quantized:
    original_params = sum(p.numel() for p in model.parameters())
    quantized_params = sum(p.numel() for p in quantized_model.parameters())

    # Collect results into a single row
    row = {
        "original_params": original_params,
        "quantized_params": quantized_params,
        "param_reduction": original_params - quantized_params,
        "param_ratio": quantized_params / original_params,
    }
    scores = {
        "train": (train_original, train_quantized),
        "test": (test_original, test_quantized),
    }
    # Per metric: original (train, test), quantized (train, test), drop (train, test)
    for metric in ("accuracy", "f1", "precision", "recall"):
        for split, (before, _) in scores.items():
            row[f"{split}_{metric}_original"] = before[metric]
        for split, (_, after) in scores.items():
            row[f"{split}_{metric}_quantized"] = after[metric]
        for split, (before, after) in scores.items():
            row[f"{split}_{metric}_drop"] = before[metric] - after[metric]

    return pd.DataFrame([row], index=[model_name])

In [None]:
# The four fine-tuned models to quantize: display name -> (weights folder, batch size).
# The batch sizes were typed in by hand.
model_configs = {
    "BERTweet-Base (rec4)": ("best_model_bertweet_base_rec4", 128),
    "BERTweet-Base (rec5 - HF)": ("best_model_bertweet_base_rec5", 64),
    "RoBERTa-Base-Tweet (rec4)": ("best_model_roberta_base_tweet_rec4", 128),
    "RoBERTa-Base-Tweet (rec5 - HF)": ("best_model_roberta_base_tweet_rec5", 128),
}

all_results = []

for model_name in model_configs:
    model_name_dir, batch_size = model_configs[model_name]
    print(f"\nPost-Training Quantization Results for {model_name}:")
    results_df = quantize_evaluate_and_compare(model_name, model_name_dir, {"batch_size": batch_size})
    results_df.index.name = "model_name"
    display(results_df)
    all_results.append(results_df)

# One row per model, indexed by model name
all_results_df = pd.concat(all_results)

In [None]:
# Show the quantization comparison for all 4 models
display(all_results_df)

# Persist the table (model names are kept as the index column)
save_path = quant_root + "/quantization_results.csv"
all_results_df.to_csv(save_path)
print(f"\nAll post-training quantization results saved to: {save_path}")

## **Technique (2) - Pruning**

As a model compression technique, **Pruning reduces model size and speeds up inference by setting "unimportant" weights to 0**. In this project we implemented **unstructured global pruning: a fraction** (40% by default) **of the trained model's weights — those with the smallest magnitudes (absolute values) — is set to 0**. The function below applies this pruning **globally, either over all linear layers only or over all layers that have weights** (depending on user need), **to a fine-tuned HF model** (one that has gone through the final training phase above): it looks across the selected layers for the weights with the smallest magnitudes and prunes them, i.e. sets them to 0. It then evaluates the pruned model on the training and test sets and saves the pruned version.

In [None]:
# Folder layout used by the pruning cells below. Every path is derived from basic_drive_path,
# which this cell does not define - it must already exist from an earlier cell.
# basic_drive_path = "/content/drive/MyDrive" # Uncomment and edit if the Project_COVID_NLP folder was downloaded somewhere other than this Drive location.
project_root = f"{basic_drive_path}/Project_COVID_NLP" # Root project folder
model_root   = f"{project_root}/Model_Weights" # Fine-tuned model weights (read by prune_evaluate_and_compare)

# Define prune_root inside the project, for all pruned weights
prune_root = f"{project_root}/Pruned_Model_Weights"

In [None]:
# This function prunes a fine-tuned HF model, evaluates it on training & test sets, compares its performance with the previous model's, and saves the pruned version.
# is_linear = a boolean variable set by the user whether global unstructured pruning of is desired only across the linear layers, or across all model weights. False by default.
def prune_evaluate_and_compare(model_name, model_name_dir, best_params, is_linear = False, amount = 0.4):
    """Globally prune a fine-tuned model (L1, unstructured) and compare it with the original.

    The fine-tuned model is loaded from `{model_root}/{model_name_dir}` and evaluated; a second
    copy is pruned, saved under `{prune_root}/{model_name_dir}_pruned` and evaluated as well.

    Pruning only zeroes weights: the saved checkpoint still stores dense tensors, so
    `pruned_params` reports the number of NON-ZERO parameters rather than a smaller tensor size.

    Args:
        model_name: display name, used as the index of the returned row.
        model_name_dir: folder name of the fine-tuned model; a name containing "roberta"
            (case-insensitive) selects the Twitter-RoBERTa pre-tokenized dataset, anything else
            the BERTweet one.
        best_params: dict providing "batch_size" for the data loaders.
        is_linear: True -> prune only the weights of `nn.Linear` layers;
            False (default) -> prune the `weight` of every module that has one.
        amount: forwarded to `prune.global_unstructured`; with the default 0.4, the 40% of the
            selected weights with the smallest magnitudes are set to 0.

    Returns:
        One-row pandas DataFrame with the parameter counts and, for accuracy / F1 / precision /
        recall, the original, pruned and drop (original - pruned) values on train and test.
    """

    # Define original model path (trained weights) and pruned save path
    model_path  = f"{model_root}/{model_name_dir}"
    pruned_path = f"{prune_root}/{model_name_dir}_pruned"

    # Select the pre-tokenized dataset that matches the model's tokenizer
    if "roberta" in model_name_dir.lower():
        pretokenized_dir = "data/tokenized_twitter_roberta_base"
    else:
        pretokenized_dir = "data/tokenized_bertweet_base"

    # Load model & tokenizer
    model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # safety: correct dtypes + torch output
    ds = load_from_disk(pretokenized_dir) # Arrow-backed HF DatasetDict written by the notebook's pre-tokenization step
    for split in ds:
        ds[split] = ds[split].cast_column("input_ids", Sequence(Value("int64")))
        ds[split] = ds[split].cast_column("attention_mask", Sequence(Value("int64")))  # or "bool"
        ds[split] = ds[split].cast_column("labels", Value("int64"))
        ds[split].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # keep dynamic padding (no tokenization here - the collator only pads per batch)
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="pt")

    # Same train + validation merge that was used for the final training
    full_train_dataset = concatenate_datasets([ds["train_reduced"], ds["validation"]])
    full_train_dataset = full_train_dataset.shuffle(seed=42)

    # Evaluation-only loaders: no shuffling, so the original and the pruned model are scored
    # on exactly the same batches and the comparison is reproducible.
    train_loader = DataLoader(
        full_train_dataset, batch_size=best_params["batch_size"], shuffle=False,
        collate_fn=collator, num_workers=4, pin_memory=True,
        persistent_workers=True, prefetch_factor=2
    )
    test_loader = DataLoader(
        ds["test"], batch_size=min(2*best_params["batch_size"], 128), shuffle=False,
        collate_fn=collator, num_workers=4, pin_memory=True,
    )

    # Evaluate the performance of original model - first:
    train_original = evaluate_model(model, train_loader, device)
    test_original = evaluate_model(model, test_loader, device)

    # Define the pruned model by reloading the original model
    pruned_model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)

    # Collect the (module, parameter name) pairs to be pruned:
    if is_linear:
        # Unstructured global pruning only across linear layers
        parameters_to_prune = [(m, "weight") for m in pruned_model.modules() if isinstance(m, nn.Linear)]
    else:
        # Unstructured global pruning across ALL modules that own a weight parameter
        # (modules whose `weight` is None are skipped - they cannot be pruned)
        parameters_to_prune = [
            (m, "weight") for m in pruned_model.modules()
            if isinstance(getattr(m, "weight", None), nn.Parameter)
        ]

    # Apply pruning (global, unstructured): the smallest-magnitude weights over all selected tensors are set to 0
    prune.global_unstructured(
        parameters_to_prune,
        pruning_method=prune.L1Unstructured,
        amount=amount
    )

    for m, n in parameters_to_prune:
        prune.remove(m, n) # make the pruning permanent: drop the mask re-parametrization and keep the zeroed weight

    # Save pruned model's config & weights
    os.makedirs(pruned_path, exist_ok=True) # Create directory if it doesn't exist
    pruned_model.save_pretrained(pruned_path)
    tokenizer.save_pretrained(pruned_path)

    # Evaluate the performance of pruned model - second:
    train_pruned = evaluate_model(pruned_model, train_loader, device)
    test_pruned = evaluate_model(pruned_model, test_loader, device)

    # Count parameters: all of the original's, only the non-zero ones of the pruned model
    original_params = sum(p.numel() for p in model.parameters())
    pruned_params = sum(torch.count_nonzero(p).item() for p in pruned_model.parameters())

    # Collect results into a single row
    row = {
        "original_params": original_params,
        "pruned_params": pruned_params,
        "param_reduction": original_params - pruned_params,
        "param_ratio": pruned_params / original_params,
    }
    scores = {
        "train": (train_original, train_pruned),
        "test": (test_original, test_pruned),
    }
    # Per metric: original (train, test), pruned (train, test), drop (train, test)
    for metric in ("accuracy", "f1", "precision", "recall"):
        for split, (before, _) in scores.items():
            row[f"{split}_{metric}_original"] = before[metric]
        for split, (_, after) in scores.items():
            row[f"{split}_{metric}_pruned"] = after[metric]
        for split, (before, after) in scores.items():
            row[f"{split}_{metric}_drop"] = before[metric] - after[metric]

    return pd.DataFrame([row], index=[model_name])

In [None]:
# The four fine-tuned models to prune: display name -> (weights folder, batch size).
# The batch sizes were typed in by hand. In this cell only the LINEAR layers are pruned.
model_configs = {
    "BERTweet-Base (rec4)": ("best_model_bertweet_base_rec4", 128),
    "BERTweet-Base (rec5 - HF)": ("best_model_bertweet_base_rec5", 64),
    "RoBERTa-Base-Tweet (rec4)": ("best_model_roberta_base_tweet_rec4", 128),
    "RoBERTa-Base-Tweet (rec5 - HF)": ("best_model_roberta_base_tweet_rec5", 128),
}

all_results = []

for model_name in model_configs:
    model_name_dir, batch_size = model_configs[model_name]
    print(f"\nUnstructured global Pruning Results for {model_name} (considering linear layers only):")
    results_df = prune_evaluate_and_compare(
        model_name, model_name_dir, {"batch_size": batch_size}, is_linear=True
    )
    results_df.index.name = "model_name"
    display(results_df)
    all_results.append(results_df)

# One row per model, indexed by model name
all_results_df = pd.concat(all_results)

In [None]:
# Show the pruning comparison for all 4 models - linear layers only
display(all_results_df)

# Persist the table (model names are kept as the index column)
save_path = prune_root + "/pruning_results_linear.csv"
all_results_df.to_csv(save_path)
print(f"\nAll unstructured global pruning results (considering linear layers only) saved to: {save_path} ")

In [None]:
# Second pruning pass over the same 4 models (model_configs comes from the linear-layers cell),
# this time pruning ALL layers that have weights.
all_results = []

for model_name in model_configs:
    model_name_dir, batch_size = model_configs[model_name]
    print(f"\nUnstructured global Pruning Results for {model_name} (considering all model layers):")
    results_df = prune_evaluate_and_compare(
        model_name, model_name_dir, {"batch_size": batch_size}, is_linear=False
    )
    results_df.index.name = "model_name"
    display(results_df)
    all_results.append(results_df)

# One row per model, indexed by model name
all_results_df = pd.concat(all_results)

In [None]:
# Show the pruning comparison for all 4 models - all layers
display(all_results_df)

# Persist the table (model names are kept as the index column)
save_path = prune_root + "/pruning_results_generalized.csv"
all_results_df.to_csv(save_path)
print(f"\nAll unstructured global pruning results (considering all model layers) saved to: {save_path}")

## **Technique (3) - Knowledge-Distillation (KD)**

As a compression technique, **Knowledge Distillation** reduces model size by **training a compact student model** (compact = with far fewer parameters) **to imitate a stronger teacher model by matching the teacher’s soft predictions while still learning from the gold labels**. In our pipeline the fine-tuned teacher from Model_Weights is frozen and kept in eval() mode, and the student (e.g., arampacha/roberta-tiny) is optimized with the mixed objective $\alpha \cdot \mathrm{CE}(y, s) + (1-\alpha) \cdot T^2 \cdot \mathrm{KL}\big(\mathrm{softmax}(t/T) \,\|\, \mathrm{softmax}(s/T)\big)$, where $s$ and $t$ are the student and teacher logits, $y$ is the gold label, $T$ is the temperature, and $\alpha$ controls the balance between label supervision and teacher guidance. We train on the pre-tokenized datasets, log train and test metrics for both teacher and student to Weights & Biases, and save the best student checkpoint under KD_Model_Weights, along with a CSV of results for later comparison.

In [None]:
# Folder layout used by the knowledge-distillation cells below. Every path is derived from
# basic_drive_path, which this cell does not define - it must already exist from an earlier cell.
# basic_drive_path = "/content/drive/MyDrive" # Uncomment and edit if the Project_COVID_NLP folder was downloaded somewhere other than this Drive location.
project_root = f"{basic_drive_path}/Project_COVID_NLP" # Root project folder
model_root   = f"{project_root}/Model_Weights" # Fine-tuned (teacher) model weights

# Define KD_root inside the project, for all KD weights
KD_root = f"{project_root}/KD_Model_Weights"

In [None]:
from transformers import TrainerCallback

class TrainEvalCallback(TrainerCallback):
    """Trainer callback that reports student metrics on both the train and the eval (test) set.

    - `on_epoch_end` evaluates the trainer's model on `train_dataset` and logs the result to the
      HF log and, when a W&B run is active, to W&B under `train/student_*`.
    - `on_evaluate` mirrors the Trainer's regular evaluation metrics to W&B under `test/student_*`.

    Args:
        trainer: the Trainer whose `evaluate` / `log` methods are used.
        train_dataset: dataset evaluated at the end of every epoch.
    """

    def __init__(self, trainer, train_dataset):
        self.trainer = trainer
        self.train_dataset = train_dataset

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Mirror the numeric eval metrics to W&B as `test/student_<metric>`."""
        # Trainer.evaluate() fires on_evaluate after EVERY evaluation - including the train-set
        # pass started by on_epoch_end below (keys prefixed "train_"). Skip that one; otherwise
        # the train metrics would also be logged under the test/ namespace.
        if metrics and any(k.startswith("train_") for k in metrics):
            return control

        # HF already computed eval_* on the test set this epoch.
        if metrics and wandb.run is not None:
            out = {}
            for k, v in metrics.items():
                if isinstance(v, (int, float)):
                    key = k[5:] if k.startswith("eval_") else k  # strip the "eval_" prefix
                    out[f"test/student_{key}"] = float(v)
            wandb.log(out)
        return control

    def on_epoch_end(self, args, state, control, **kwargs):
        """Evaluate on the training set once per epoch and log the metrics."""
        # Compute train metrics once per epoch
        train_metrics = self.trainer.evaluate(
            eval_dataset=self.train_dataset,
            metric_key_prefix="train"  # -> train_loss, train_accuracy, ...
        )

        # 1) HF log keeps the pretty table
        self.trainer.log(train_metrics)

        # 1a) ALSO put 'loss' so the "Training Loss" column isn't "No log"
        if "train_loss" in train_metrics:
            self.trainer.log({"loss": float(train_metrics["train_loss"])})

        # 2) W&B: log under train/student_* (no explicit step)
        if wandb.run is not None:
            out = {}
            for k, v in train_metrics.items():
                if isinstance(v, (int, float)):
                    key = k[6:] if k.startswith("train_") else k  # strip the "train_" prefix
                    out[f"train/student_{key}"] = float(v)
            wandb.log(out)
        return control

In [None]:
class DistillationTrainer(Trainer):
    """Trainer whose loss mixes hard-label cross-entropy with a distillation (KL) term.

    loss = alpha * CE(student, labels) + (1 - alpha) * T^2 * KL(teacher || student),
    with both distributions softened by the temperature T.

    Args:
        teacher_model: model providing the soft targets; it is put in eval mode and frozen.
        temperature: softening temperature T applied to student and teacher logits.
        alpha: weight of the cross-entropy term; (1 - alpha) weights the KL term.
        *args, **kwargs: forwarded unchanged to `Trainer`.
    """

    def __init__(self, *args, teacher_model=None, temperature=2.0, alpha=0.5, **kwargs):
        super().__init__(*args, **kwargs)
        # The teacher only supplies targets: eval mode, no gradients.
        self.teacher = teacher_model
        self.teacher.eval().requires_grad_(False)
        self.temperature = float(temperature)
        self.alpha = float(alpha)

    @torch.no_grad()
    def _teacher_forward(self, **inputs):
        """Forward pass of the teacher without gradients; a "labels" entry is never passed on."""
        inputs.pop("labels", None)
        return self.teacher(**inputs)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the distillation loss (and the student outputs when `return_outputs` is True)."""
        # Take the labels out of the batch so the student does not compute its own CE loss.
        targets = inputs.pop("labels")
        if targets.dtype != torch.long:
            targets = targets.long()
        # Cheap guard: force labels into the valid class range 0..4.
        out_of_range = (targets < 0) | (targets > 4)
        if out_of_range.any():
            targets = targets.clamp_(0, 4)

        # Student forward pass (with gradients), teacher forward pass (without).
        student_out = model(**inputs)
        teacher_out = self._teacher_forward(**inputs)

        temp = self.temperature
        hard_loss = F.cross_entropy(student_out.logits, targets)

        # KL between the temperature-softened distributions, rescaled by T^2.
        student_log_probs = F.log_softmax(student_out.logits / temp, dim=-1)
        teacher_probs = F.softmax(teacher_out.logits / temp, dim=-1)
        soft_loss = F.kl_div(student_log_probs, teacher_probs, reduction="batchmean") * (temp * temp)

        total = self.alpha * hard_loss + (1.0 - self.alpha) * soft_loss
        if return_outputs:
            return (total, student_out)
        return total

In [None]:
def distill_evaluate_and_compare(model_name, model_name_dir, best_params, student_model_name, alpha=0.5, temperature=2.0, num_epochs=5):
    """Distill a saved teacher into a student model and compare both on train and test data.

    Relies on notebook-level names defined in other cells: `model_root`, `KD_root`, `device`,
    `compute_metrics`, `TrainEvalCallback` and `DistillationTrainer`.

    Args:
        model_name: display name of the teacher; used in the W&B project/run names and as the
            index label of the returned row.
        model_name_dir: sub-directory of `model_root` that holds the saved teacher. If it contains
            "roberta" (case-insensitive) the RoBERTa pre-tokenized dataset is loaded, otherwise
            the BERTweet one.
        best_params: dict providing "learning_rate", "weight_decay" and "batch_size".
            "num_layers_finetune" is optional and only logged to W&B; other keys are not read.
        student_model_name: checkpoint id/path of the student passed to `from_pretrained`.
        alpha: forwarded to `DistillationTrainer` (weight of the hard-label loss).
        temperature: forwarded to `DistillationTrainer` (softmax temperature).
        num_epochs: number of training epochs.

    Returns:
        A one-row `pd.DataFrame` indexed by `model_name` with parameter counts
        (`teacher_params`, `student_params`, `param_reduction`, `param_ratio`) and, for each of
        accuracy / f1 / precision / recall, the teacher and student train/test values plus
        `drop_*` columns (teacher minus student). Metrics that are missing become NaN.

    Side effects:
        Creates the output directory under `KD_root`, opens (and always closes) a W&B run,
        writes checkpoints, and saves the final student model and tokenizer.
    """

    # Paths
    teacher_path = f"{model_root}/{model_name_dir}"
    student_slug = student_model_name.replace("/", "-")
    KD_path      = f"{KD_root}/{model_name_dir}_distilled_student_{student_slug}"
    os.makedirs(KD_path, exist_ok=True)

    # Select correct pretokenized dataset
    if "roberta" in model_name_dir.lower():
        pretokenized_dir = "data/tokenized_twitter_roberta_base"
    else:
        pretokenized_dir = "data/tokenized_bertweet_base"

    # safety: correct dtypes + torch output
    ds = load_from_disk(pretokenized_dir)
    for split in ds:
        ds[split] = ds[split].cast_column("input_ids", Sequence(Value("int64")))
        ds[split] = ds[split].cast_column("attention_mask", Sequence(Value("int64")))
        ds[split] = ds[split].cast_column("labels", Value("int64"))
        ds[split].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # Merge train + validation for final training
    full_train_dataset = concatenate_datasets([ds["train_reduced"], ds["validation"]]).shuffle(seed=42)
    test_dataset = ds["test"]

    # Load Teacher + tokenizer
    teacher = AutoModelForSequenceClassification.from_pretrained(teacher_path).to(device)
    teacher.eval()
    for p in teacher.parameters():
        p.requires_grad = False
    tokenizer = AutoTokenizer.from_pretrained(teacher_path)

    # Load Student + tokenizer (vocab aligned).
    # The student's classification head takes its width from the teacher's config instead of a
    # hard-coded 5: the KD loss compares the two logit vectors element-wise, so they must match.
    student = AutoModelForSequenceClassification.from_pretrained(
        student_model_name,
        num_labels=teacher.config.num_labels,
        ignore_mismatched_sizes=True,
    ).to(device)
    # The same token ids are fed to both models, so the student reuses the teacher's tokenizer
    # and its embedding matrix is resized to that vocabulary size.
    student_tokenizer = tokenizer
    student.resize_token_embeddings(len(student_tokenizer))

    # keep dynamic padding (no tokenization here - collator only pads per batch)
    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="pt")

    # W&B init (project name includes teacher dir + student)
    if wandb.run is not None:
        wandb.finish()
    wandb.init(
        project=f"KD_{model_name}__student_{student_slug}_21.8.2025",
        entity="idoshahar96-tel-aviv-university",
        config={
            "learning_rate": best_params["learning_rate"],
            "weight_decay": best_params["weight_decay"],
            "batch_size": best_params["batch_size"],
            "num_layers_finetune": best_params.get("num_layers_finetune", None),
            "teacher_path": teacher_path,
            "student_model": student_model_name,
            "alpha": alpha,
            "temperature": temperature,
            "epochs": num_epochs,
        },
        name=f"KD_{model_name}__{student_slug}",
        reinit=True
    )

    # Everything that logs to the W&B run lives in this try-block so the run is closed even
    # when training/evaluation raises (it is marked as failed in that case).
    run_completed = False
    try:
        # TrainingArguments for the Hugging Face Trainer
        # NOTE(review): eval_dataset below is the TEST split and load_best_model_at_end picks the
        # checkpoint by accuracy on it, so the reported student test metrics come from a
        # checkpoint that was selected on the test set.
        training_args = TrainingArguments(
            output_dir=KD_path, # where checkpoints will be saved
            per_device_train_batch_size=best_params["batch_size"],
            per_device_eval_batch_size=best_params["batch_size"],
            learning_rate=best_params["learning_rate"],
            weight_decay=best_params["weight_decay"],
            num_train_epochs=num_epochs,  # number of training epochs (function default: 5)
            eval_strategy="epoch",        # evaluate at the end of each epoch
            save_strategy="epoch",        # save a checkpoint at the end of each epoch
            logging_strategy="epoch",     # log metrics at the end of each epoch
            load_best_model_at_end=True,  # reload the best checkpoint (based on metric_for_best_model)
            metric_for_best_model="accuracy", # optimize w.r.t accuracy
            greater_is_better=True,
            save_total_limit=1,
            remove_unused_columns=False,
            label_names=["labels"],
            report_to="wandb",
        )

        # Trainer
        trainer_distill = DistillationTrainer(
            model=student,
            teacher_model=teacher,
            args=training_args,
            train_dataset=full_train_dataset,
            eval_dataset=test_dataset,
            data_collator=collator,
            compute_metrics=compute_metrics,
            temperature=temperature,
            alpha=alpha,
        )

        # per-epoch: TEST (from on_evaluate) then TRAIN (from on_epoch_end)
        trainer_distill.add_callback(TrainEvalCallback(trainer_distill, full_train_dataset))

        # TRAIN - KD
        trainer_distill.train()
        print("\nDistillation complete. Student trained & best model loaded.")

        # Final metrics (TEST first, then TRAIN) for teacher & student
        def _eval_with(model_to_eval, dataset, prefix):
            """Evaluate `model_to_eval` on `dataset` with a plain Trainer; metric keys get `prefix`."""
            tmp = Trainer(
                model=model_to_eval,
                args=training_args,
                eval_dataset=dataset,
                data_collator=collator,
                compute_metrics=compute_metrics,
            )
            return tmp.evaluate(metric_key_prefix=prefix)

        # Teacher metrics
        teacher_test   = _eval_with(teacher, test_dataset,        "teacher_test")
        teacher_train  = _eval_with(teacher, full_train_dataset, "teacher_train")

        # Student metrics
        student_test   = trainer_distill.evaluate(eval_dataset=test_dataset,        metric_key_prefix="student_test")
        student_train  = trainer_distill.evaluate(eval_dataset=full_train_dataset, metric_key_prefix="student_train")

        # Log to W&B (TEST first, then TRAIN) so panels order naturally
        if wandb.run is not None:
            def log_group(prefix, who, d):
                """Log the numeric entries of `d` under `<prefix>/<who>_<key>`, where `<key>` is
                the metric name with everything up to the first `<who>_` removed."""
                out = {}
                for k, v in d.items():
                    if isinstance(v, (int, float)):
                        key = k.split(f"{who}_", 1)[-1] if f"{who}_" in k else k
                        out[f"{prefix}/{who}_{key}"] = float(v)
                wandb.log(out)

            log_group("test",  "teacher", teacher_test)
            log_group("test",  "student", student_test)
            log_group("train", "teacher", teacher_train)
            log_group("train", "student", student_train)

        # Count params & save student model (best-epoch)
        teacher_params = sum(p.numel() for p in teacher.parameters())
        student_params = sum(p.numel() for p in student.parameters())
        print(f"Teacher params: {teacher_params:,}")
        print(f"Student params: {student_params:,}")
        trainer_distill.model.save_pretrained(KD_path)
        student_tokenizer.save_pretrained(KD_path)
        print(f"Best student model saved to {KD_path}")
        run_completed = True
    finally:
        if run_completed:
            wandb.finish()
        else:
            # non-zero exit code marks the run as failed instead of leaving it open
            wandb.finish(exit_code=1)

    # Build output DataFrame (params statistics, then metrics, then drops in metrics while comparing the models)
    def g(d, k):
        """Return d[k] as a float, or NaN when the key is missing or not convertible."""
        v = d.get(k, float("nan"))
        try:
            return float(v)
        except Exception:
            return float("nan")

    def pack(metric_name, t_train, t_test, s_train, s_test):
        """Collect one metric for teacher/student on train/test plus the teacher-minus-student drops."""
        tt = g(t_train, f"teacher_train_{metric_name}")
        te = g(t_test,  f"teacher_test_{metric_name}")
        st = g(s_train, f"student_train_{metric_name}")
        se = g(s_test,  f"student_test_{metric_name}")
        return {
            f"teacher_train_{metric_name}": tt,
            f"teacher_test_{metric_name}":  te,
            f"student_train_{metric_name}": st,
            f"student_test_{metric_name}":  se,
            f"drop_train_{metric_name}":    tt - st,  # teacher − student
            f"drop_test_{metric_name}":     te - se,
        }

    row = {
        "teacher_params": teacher_params,
        "student_params": student_params,
        "param_reduction": teacher_params - student_params,
        "param_ratio": student_params / teacher_params,
    }
    for m in ["accuracy", "f1", "precision", "recall"]:
        row.update(pack(m, teacher_train, teacher_test, student_train, student_test))

    results = pd.DataFrame([row], index=[model_name])
    return results

In [None]:
# Knowledge-Distillation (KD) for each of the 4 fine-tuned teachers.
# Maps a display name -> (saved-model directory, best hyper-parameters). The hyper-parameter
# values were typed in manually!
model_configs = {
    "BERTweet-Base (rec4)": (
        "best_model_bertweet_base_rec4",
        {
            'learning_rate': 0.0001184412471705182,
            'weight_decay': 1.2699696348040995e-05,
            'patience': 10,
            'batch_size': 128,
            'num_layers_finetune': 3,
        },
    ),
    "BERTweet-Base (rec5 - HF)": (
        "best_model_bertweet_base_rec5",
        {
            'learning_rate': 7.668855564109297e-05,
            'weight_decay': 4.8978169582912055e-06,
            'patience': 9,
            'batch_size': 64,
            'num_layers_finetune': 3,
            'lr_scheduler_type': 'linear',
        },
    ),
    "RoBERTa-Base-Tweet (rec4)": (
        "best_model_roberta_base_tweet_rec4",
        {
            'learning_rate': 0.0003834791389042033,
            'weight_decay': 2.88286253103848e-06,
            'patience': 7,
            'batch_size': 128,
            'num_layers_finetune': 3,
        },
    ),
    "RoBERTa-Base-Tweet (rec5 - HF)": (
        "best_model_roberta_base_tweet_rec5",
        {
            'learning_rate': 0.0000860370374400373,
            'weight_decay': 0.00008459884214639005,
            'patience': 10,
            'batch_size': 128,
            'num_layers_finetune': 3,
            'lr_scheduler_type': 'polynomial',
        },
    ),
}

# Student checkpoint id, shared by every teacher in the loop below
student_model_name = "arampacha/roberta-tiny"
all_results = []

for model_name in model_configs:
    model_name_dir, best_params = model_configs[model_name]
    print(f"\nKnowledge-Distillation (KD) Results for TEACHER: {model_name}, STUDENT: {student_model_name} (5 epochs):")
    results_df = distill_evaluate_and_compare(
        model_name,
        model_name_dir,
        best_params,
        student_model_name=student_model_name,
    )
    # Name the index so it is labelled in the display and in the combined table
    results_df.index.name = "model_name"
    all_results.append(results_df)
    display(results_df)

# Stack the one-row frames (one per teacher) into a single table, keeping the teacher names as index
all_results_df = pd.concat(all_results, ignore_index=False)

In [None]:
# Tag every row with the student's name. This is redundant with the CSV file name, but it keeps
# the student explicit inside the table itself.
all_results_df = all_results_df.assign(student_model_name="arampacha-roberta-tiny")

# Put "student_model_name" first (right after the index), keeping the other columns in order
cols = ["student_model_name", *all_results_df.columns.drop("student_model_name")]
all_results_df = all_results_df.loc[:, cols]

# Display Knowledge-Distillation (KD) results over all 4 models
display(all_results_df)

# Persist the combined table for future use
student_model_name_for_csv = "arampacha_roberta_tiny"
save_path = f"{KD_root}/KD_results_{student_model_name_for_csv}.csv"
all_results_df.to_csv(save_path, index=True)
print(f"\nAll Knowledge-Distillation (KD) results saved to: {save_path}")