# Natural Language Processing (NLP) - Advanced Topics in DL

Group Z:<br>
    - Iliya Morgunov - 206361412<br>
    - Eadan Schechter - 209793553

# Imports

In [None]:
#pip install -U transformers

In [None]:
#pip install transformers[torch]

In [None]:
#pip install optuna wandb

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

import time
import os
os.environ["WANDB_SILENT"] = "true"
import shutil
import re

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, EarlyStoppingCallback, Trainer
from datasets import Dataset as HFDataset        # Hugging Face

import torch
from torch import nn, optim
from torch.utils.data import Dataset as TorchDataset, DataLoader  # PyTorch
import torch.nn.functional as F
from torch.quantization import quantize_dynamic
from torch.nn.utils import prune

from datetime import datetime
from tqdm import tqdm

import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)
import wandb
# SECURITY: never commit an API key to the notebook. The key is read from the
# WANDB_API_KEY environment variable; when it is unset, `key=None` lets wandb
# fall back to its own credential lookup / interactive login. The key that was
# previously hardcoded on this line is exposed and should be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))


import warnings
from transformers.utils import logging as hf_logging

warnings.filterwarnings("ignore")
hf_logging.set_verbosity_error()

In [None]:
# Read the train and test splits from CSV files in the working directory.
df_train = pd.read_csv(filepath_or_buffer="df_train_final.csv")
df_test = pd.read_csv(filepath_or_buffer="df_test_final.csv")

# Part 2

In [None]:
# Environment sanity check. Prints, in this order:
#   1. whether PyTorch can use a CUDA device,
#   2. the CUDA version reported by torch.version.cuda,
#   3. the cuDNN version reported by torch.backends.cudnn.
import torch
print(torch.cuda.is_available())
print(torch.version.cuda)
print(torch.backends.cudnn.version())


True
12.4
90300


In [None]:
# Train on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device  # last expression of the cell: the notebook displays the selected device

device(type='cuda')

## Fine-tuning BERTweet

BERTweet is a RoBERTa-based language model specifically pre-trained on over 850 million English tweets to capture the linguistic nuances and informal conventions of social media text. Academic research has shown that BERTweet achieves state-of-the-art results for tweet sentiment analysis. For example, Nguyen et al. (2020) demonstrated that BERTweet substantially outperformed general-domain models like RoBERTa and XLM-R when classifying tweet sentiment, improving the previous state-of-the-art by approximately 5% absolute F1 on the SemEval-2017 Twitter sentiment benchmark. This improvement translates to a significantly higher accuracy in classifying tweets as positive, negative, or neutral, compared to earlier transformer-based approaches. The model’s strong performance is attributed to its large-scale, domain-specific pre-training, which allows it to recognize Twitter slang, emojis, hashtags, and informal syntax much better than generic BERT variants.

The BERTweet pre-training corpus is highly relevant to our own project. It consists of an 80GB collection of uncompressed text containing 850 million tweets (about 16 billion word tokens), gathered from two main sources. The first and largest corpus (845M tweets) was collected from the Twitter Stream Archive, spanning from January 2012 to August 2019, and includes only English-language tweets. These tweets were pre-processed with the TweetTokenizer from the NLTK toolkit, and emojis were converted into text. User mentions and URLs were normalized into special tokens (@USER, HTTPURL), and only tweets containing between 10 and 64 tokens were retained, with retweets excluded. Notably, the second component of the pre-training data includes 5 million English tweets related to the COVID-19 pandemic, collected between January and March 2020—an interval that partially overlaps with the timeline of the data in our own analysis. This overlap suggests that BERTweet is well-equipped to handle the vocabulary, topics, and sentiment expressions specific to the COVID-19 period.

BERTweet is publicly available and can be easily downloaded (e.g., from the Hugging Face Hub as vinai/bertweet-base) and fine-tuned in PyTorch for sentiment classification tasks. In summary, BERTweet represents a best-in-class choice for Twitter sentiment analysis, as it combines the robust RoBERTa architecture with domain knowledge of tweets—including some from the same time window as our dataset—enabling more accurate and contextually aware sentiment modeling.

In [None]:
# Load the BERTweet tokenizer and a 5-class sequence-classification model, then inspect it
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
try:
    model = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=5).to(device) # We have 5 sentiments
except Exception:
    # Default loading failed: retry, explicitly requesting the safetensors weights.
    # `except Exception` instead of a bare `except` so that KeyboardInterrupt /
    # SystemExit (e.g. interrupting the kernel mid-download) are not swallowed.
    model = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=5, use_safetensors=True).to(device) # We have 5 sentiments

model  # last expression of the cell: the notebook displays the architecture

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

### PyTorch Fine-Tuning

The TweetDataset class inherits from PyTorch’s Dataset class, enabling us to efficiently handle and preprocess tweet data for model training.

This custom dataset class is especially useful for working with data formats not directly supported by PyTorch's built-in datasets, and for text data that requires tokenization and transformation before being fed into a neural network.

The three essential methods are:

`__init__`: Initializes the dataset with the raw tweet texts, corresponding sentiment labels, tokenizer, and any additional settings.

`__len__`: Returns the total number of tweet samples in the dataset, which allows the PyTorch DataLoader to know how many batches to create.

`__getitem__`: Retrieves a single tweet (data sample) and its label by index, applies the tokenizer to convert the tweet into input features (input_ids, attention_mask), and returns the tensors ready for model input.

This structure allows for flexible and efficient batching, shuffling, and preprocessing of tweets during model training and evaluation.

We now implement the TweetDataset class and prepare our data for PyTorch training. The sequence length was selected based on our EDA (see previous section), and all data and labels are processed for efficient batching.

For model training, we use a standard fine-tuning pipeline with early stopping, hyperparameter tuning via Optuna, and experiment tracking with Weights & Biases (W&B), following best practices introduced in course Exercise 4.

In [None]:
class TweetDataset(TorchDataset): # Dataset Class
    """
    PyTorch Dataset for tweet sentiment analysis backed by a DataFrame.

    Texts and labels are copied into plain Python lists at construction time;
    tokenization happens lazily, one tweet per `__getitem__` call.
    """

    def __init__(self, dataframe, tokenizer, text_col="OriginalTweet", label_col="label", max_len=80):
        """
        Args:
            dataframe (pd.DataFrame): DataFrame containing tweets and labels.
            tokenizer: Tokenizer object exposing `encode_plus` (e.g. a Hugging Face tokenizer).
            text_col (str): Column name containing tweet text.
            label_col (str): Column name containing the sentiment label (already integer encoded).
            max_len (int): Length every tokenized tweet is padded/truncated to.
        """

        # astype(str) guarantees the tokenizer always receives a string, even for non-string cells
        self.texts = dataframe[text_col].astype(str).tolist()
        self.labels = dataframe[label_col].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len # Ensures all sequences have the same length (Maximum tweet length allowed)

    def __len__(self):
        # Returns total number of tweets
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Tokenize the tweet at position `idx`.

        Returns:
            dict: 'input_ids' and 'attention_mask' tensors of shape [max_len],
            and 'labels' as a scalar long tensor.
        """

        text = self.texts[idx]     # Raw tweet text
        label = self.labels[idx]   # Corresponding sentiment label

        # Tokenize the tweet with padding/truncation to max_len using the provided tokenizer
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,    # Pad/truncate to this length (Ensures uniform input size)
            padding='max_length',       # Add padding to reach the maximum length (pad shorter tweets to max_len)
            truncation=True,            # Truncate if longer than max_len
            return_attention_mask=True, # Generate attention mask for BERT
            return_tensors='pt'         # Output as PyTorch tensors with a leading batch axis of size 1
        )

        # squeeze(0) removes ONLY the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis when max_len == 1 and yield a 0-d tensor.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),           # Tensor of size [max_len]
            'attention_mask': encoding['attention_mask'].squeeze(0), # Tensor of size [max_len]
            'labels': torch.tensor(label, dtype=torch.long)          # Scalar tensor
        }

Apply Label Encoding to the data set sentiments

In [None]:
# Map Sentiment strings to integer class ids 0-4 (the 'label' column is integer encoded)
sentiment_classes = ['Negative', 'Neutral', 'Positive', 'Extremely Negative', 'Extremely Positive']
label2id = {label: idx for idx, label in enumerate(sentiment_classes)}

# Same order as before: full test set first, then full train set.
for split_name, split_df in (("test", df_test), ("train", df_train)):
    encoded = split_df['Sentiment'].map(label2id)
    # Series.map yields NaN for any value missing from label2id, which would silently
    # turn the column into floats and only blow up later inside training. Fail here instead.
    if encoded.isna().any():
        unknown = sorted(split_df.loc[encoded.isna(), 'Sentiment'].astype(str).unique())
        raise ValueError(f"Unmapped Sentiment values in the {split_name} set: {unknown}")
    split_df['label'] = encoded.astype("int64")

#### Early stopping function

In [None]:
def early_stop_check(patience, best_val_f1, best_val_f1_epoch, current_val_f1, current_epoch):
    """
    Update the best-F1 bookkeeping and decide whether training should stop.

    Returns:
        tuple: (best_val_f1, best_val_f1_epoch, early_stop_flag). On a strict
        improvement the first two are replaced by the current values and the
        flag is False. Otherwise they are returned unchanged and the flag is
        True only when more than `patience` epochs separate the current epoch
        from the best one.
    """
    # Strict improvement: record the new best and keep training
    if current_val_f1 > best_val_f1:
        return current_val_f1, current_epoch, False

    # No improvement: stop once the gap since the best epoch exceeds the patience
    epochs_since_best = current_epoch - best_val_f1_epoch
    return best_val_f1, best_val_f1_epoch, bool(epochs_since_best > patience)

#### Training Function

In [None]:
def train_model_with_hyperparams(model, model_name, train_loader, val_loader, optimizer, criterion, epochs, patience, trial):
    """
    Fine-tune `model` for up to `epochs` epochs, validating after every epoch.

    Batches are moved to the notebook-level `device`. Per-epoch metrics are
    logged to Weights & Biases when a run is active.

    Args:
        model: Model whose forward output exposes `.logits`.
        model_name (str): Checkpoint name; '/' is replaced by '-' to build the output file name.
        train_loader: Iterable of batches with 'input_ids', 'attention_mask' and 'labels' tensors.
        val_loader: Same batch format; used for the per-epoch validation pass.
        optimizer: Optimizer already bound to the model's parameters.
        criterion: Loss called as criterion(logits, labels).
        epochs (int): Maximum number of epochs.
        patience (int | None): Early-stopping patience on validation macro-F1.
            None disables early stopping AND best-epoch tracking, so the
            final-epoch weights are the ones saved.
        trial: Optuna trial (selects the checkpoint file name) or None.

    Returns:
        float: The best validation macro-F1 when `patience` is set (this matches
        the checkpoint written to disk); otherwise the last epoch's macro-F1.

    Side effects:
        Writes `best_model_trial_{trial.number}.pt` (best state, trial given) or
        `{model_name}_fine_tuned_pytorch_model.pt` to the working directory.
    """
    best_val_f1  = 0.0 # Keep track of best f1
    best_val_f1_epoch  = 0
    early_stop_flag = False
    best_model_state = None

    for epoch in range(1, epochs + 1):
        model.train() # Enable training mode
        train_loss = 0.0
        total_train_samples = 0
        correct_train_predictions = 0

        for batch in train_loader: # Iterate over the batches of training data
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad() # Reset gradients
            outputs = model(input_ids, attention_mask=attention_mask) # Forward pass
            logits = outputs.logits # Raw (pre-softmax) model output
            loss = criterion(logits, labels) # Calculate loss

            loss.backward() # Backward pass
            optimizer.step() # Update weights using the optimizer

            # Accumulate sample-weighted loss and correct predictions
            train_loss += loss.item() * input_ids.size(0)
            total_train_samples += input_ids.size(0)
            correct_train_predictions += (logits.argmax(dim=1) == labels).sum().item()

        train_loss /= total_train_samples
        train_accuracy = correct_train_predictions / total_train_samples

        ###  Validation loop  ###
        model.eval() # Enable evaluation mode
        val_loss = 0.0
        total_val_samples = 0
        correct_val_predictions = 0

        all_val_labels = []
        all_val_preds = []

        with torch.no_grad(): # Disable gradient computation
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                loss = criterion(logits, labels)

                batch_preds = logits.argmax(dim=1)
                val_loss += loss.item() * input_ids.size(0)
                total_val_samples += input_ids.size(0)
                correct_val_predictions += (batch_preds == labels).sum().item()

                all_val_labels.extend(labels.cpu().numpy())
                all_val_preds.extend(batch_preds.cpu().numpy())

        # calculate metrics
        val_loss /= total_val_samples
        val_accuracy = correct_val_predictions / total_val_samples
        val_precision = precision_score(all_val_labels, all_val_preds, average='macro')
        val_recall = recall_score(all_val_labels, all_val_preds, average='macro')
        val_f1 = f1_score(all_val_labels, all_val_preds, average='macro')

        if patience is not None:
            # Check for early stopping
            best_val_f1, best_val_f1_epoch, early_stop_flag = early_stop_check(patience, best_val_f1, best_val_f1_epoch, val_f1, epoch)

            # Snapshot the weights of the best epoch so far.
            # state_dict() returns references to the live parameter tensors, which later
            # optimizer steps keep mutating in place; without copying, the "best" state
            # would silently become the last-epoch state. Copies are kept on the CPU so
            # they do not occupy GPU memory.
            if val_f1 == best_val_f1:
                best_model_state = {
                    name: tensor.detach().cpu().clone()
                    for name, tensor in model.state_dict().items()
                }

        # Log metrics to Weights & Biases
        if wandb.run is not None:
            wandb.log({
                "Epoch": epoch,
                "Train Loss": train_loss,
                "Train Accuracy": train_accuracy,
                "Validation Loss": val_loss,
                "Validation Accuracy": val_accuracy,
                "Validation Precision": val_precision,
                "Validation Recall": val_recall,
                "Validation F1": val_f1})

        if early_stop_flag:  # Early-stopping condition met
            break

    if best_model_state is not None:
        # Save the best tracked state
        if trial is not None:
            torch.save(best_model_state, f"best_model_trial_{trial.number}.pt")
        else:
            torch.save(best_model_state, f"{model_name.replace('/', '-')}_fine_tuned_pytorch_model.pt")
    else:
        # No best state was tracked — save the final model as-is
        torch.save(model.state_dict(), f"{model_name.replace('/', '-')}_fine_tuned_pytorch_model.pt")

    # With best-epoch tracking on, report the score of the checkpoint that was saved,
    # not whatever the (possibly worse) last epoch produced.
    return best_val_f1 if patience is not None else val_f1

#### Optuna Function & Study

In [None]:
# Objective Function for Optuna
def objective(trial, train_df, tokenizer, model_name, text_col, label_col, device):

    """
    Generic Optuna objective for HuggingFace transformer classifiers (e.g., BERTweet, DeBERTa).

    Samples one hyperparameter set, runs 3-fold stratified cross-validation on
    `train_df` with a freshly loaded model per fold, and returns the mean of the
    per-fold validation macro-F1 values.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        train_df (pd.DataFrame): Training data with `text_col` and `label_col`.
        tokenizer: Tokenizer handed to TweetDataset.
        model_name (str): Checkpoint name passed to `from_pretrained`.
        text_col (str): Name of the tweet text column.
        label_col (str): Name of the integer label column.
        device: Device the fold models are moved to.
            NOTE(review): train_model_with_hyperparams moves batches to the
            notebook-level `device`, not to this argument — keep the two equal.

    Returns:
        float: Mean validation macro-F1 over the folds.
    """
    import gc  # only needed for the per-fold memory cleanup below

    # Hyperparameter suggestions.
    # suggest_float(..., log=True) is the supported replacement for the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)
    patience = trial.suggest_int("patience", 5, 7)
    batch_size = trial.suggest_categorical("batch_size", [32, 64])
    epochs = trial.suggest_int("epochs", 10, 25, step=5)
    num_layers = trial.suggest_int("num_layers", 1, 1)

    # For cross-validation
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    f1_scores = []

    # Extract features and labels as lists (so positional indices match)
    X = train_df[text_col].tolist()
    y = train_df[label_col].tolist()

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        # The split yields positional indices, hence iloc
        train_fold_df = train_df.iloc[train_idx][[text_col, label_col]]
        val_fold_df = train_df.iloc[val_idx][[text_col, label_col]]

        train_dataset = TweetDataset(train_fold_df, tokenizer, text_col=text_col, label_col=label_col, max_len=80)
        val_dataset = TweetDataset(val_fold_df, tokenizer, text_col=text_col, label_col=label_col, max_len=80)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Initialize a fresh model for each fold
        try:
            model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5).to(device)
        except Exception:
            # Default loading failed: retry with safetensors weights. Not a bare
            # `except`, so KeyboardInterrupt / SystemExit still propagate.
            model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5, use_safetensors=True).to(device)

        # Freeze the backbone (DeBERTa is checked first, then RoBERTa-type models
        # such as BERTweet) and re-enable only its last `num_layers` encoder layers.
        for backbone_name in ("deberta", "roberta"):
            backbone = getattr(model, backbone_name, None)
            if backbone is None or not hasattr(backbone, "encoder"):
                continue
            for param in backbone.parameters():
                param.requires_grad = False
            # Guard against num_layers == 0: layer[-0:] would select EVERY layer.
            trainable_layers = backbone.encoder.layer[-num_layers:] if num_layers > 0 else []
            for layer in trainable_layers:
                for param in layer.parameters():
                    param.requires_grad = True
            break

        # Always unfreeze classifier (for any model applied)
        for param in model.classifier.parameters():
            param.requires_grad = True

        # Define loss function and optimizer (only parameters left trainable are optimized)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(
            [param for param in model.parameters() if param.requires_grad],
            lr=learning_rate,
            weight_decay=weight_decay,
        )

        # Initialize Weights & Biases logging - the values in the config are the properties of each trial.
        wandb.init(project=f"{model_name.replace('/', '-')} Sentiment Analysis Fine-Tuning Full Code - {datetime.now().date().strftime('%Y%m%d')}",
            config={
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "patience": patience,
            "batch_size": batch_size,
            "epochs": epochs,
            "num_layers": num_layers,
            "architecture": model_name,
            "dataset": "Corona_NLP"
            },
            name=f"trial_{trial.number}_fold_{fold}", # The name that will be saved in the W&B platform
            reinit=True)

        try:
            # Train the model for this fold and get its validation F1
            fold_val_f1 = train_model_with_hyperparams(model, model_name, train_loader, val_loader, optimizer, criterion, epochs = epochs, patience=patience, trial=trial)
        finally:
            # Close the W&B run even when training raises, so no run is left open
            wandb.finish()

        # Append fold validation F1
        f1_scores.append(fold_val_f1)

        # Release this fold's model before the next one is loaded. Drop the Python
        # references first, then collect, then hand cached CUDA blocks back.
        del model, optimizer
        gc.collect()
        torch.cuda.empty_cache()

    return np.mean(f1_scores) # Mean validation F1 across folds

In [None]:
n_trials = 3
pbar = tqdm(total=n_trials, desc="Optuna Trials")


def tqdm_callback(study, trial):
    """Advance the shared progress bar by one step; registered as an Optuna study callback."""
    pbar.update(1)


# Run the Optuna study; direction="maximize" because the objective returns the mean validation F1
bertweet_study = optuna.create_study(direction="maximize")
bertweet_study.optimize(
    lambda trial: objective(
        trial,
        df_train,
        tokenizer,
        "vinai/bertweet-base",
        "OriginalTweet",
        "label",
        device,
    ),
    n_trials=n_trials,
    callbacks=[tqdm_callback],
)

pbar.close()

# Keep the winning hyperparameters and the full best-trial record for later cells
bertweet_best_params = bertweet_study.best_params
best_trial = bertweet_study.best_trial


Optuna Trials:   0%|          | 0/3 [00:37<?, ?it/s]

Optuna Trials:  33%|███▎      | 1/3 [56:34<1:53:09, 3394.57s/it][A
Optuna Trials:  67%|██████▋   | 2/3 [1:34:30<45:36, 2736.71s/it][A
Optuna Trials: 100%|██████████| 3/3 [2:06:09<00:00, 2523.30s/it]


In [None]:
# Report the best trial: its objective value followed by one line per hyperparameter
summary_lines = [
    "Best trial:",
    f"  Validation F1-score: {best_trial.value}",
    "  Hyperparameters: ",
]
summary_lines.extend(f"    {key}: {value}" for key, value in best_trial.params.items())
print("\n".join(summary_lines))

Best trial:
  Validation F1-score: 0.6386208620181888
  Hyperparameters: 
    learning_rate: 0.000576610266987812
    weight_decay: 4.3717724210425024e-06
    patience: 7
    batch_size: 32
    epochs: 10
    num_layers: 1


#### Train again and test

Instead of saving the best model from a single fold during cross-validation, we retrain the model from scratch on the entire training set (excluding the test set), using the best hyperparameters selected by Optuna. We do this in order to:

- **Maximize training data**: retraining on the full training set lets the model benefit from more examples than in any single fold, potentially improving generalization.
- **Avoid fold bias**: since no single fold sees the full training pool during cross-validation, retraining ensures the final model is not biased toward a specific validation split.

In [None]:
def retrain_transformer_full(train_df, test_df, best_params, tokenizer, model_name, label_col, text_col, device=None):

    """
    Retrain a fresh model on the full train_df using the best hyperparameters from the Optuna study.

    Args:
        train_df (pd.DataFrame): Full training set.
        test_df (pd.DataFrame): Test set. It is passed to the training function as
            the validation loader, so its metrics are computed and logged every
            epoch. With patience=None it drives neither early stopping nor
            checkpoint selection.
        best_params (dict): Must contain 'batch_size', 'num_layers',
            'learning_rate', 'weight_decay' and 'epochs'.
        tokenizer: Tokenizer handed to TweetDataset.
        model_name (str): Checkpoint name passed to `from_pretrained`.
        label_col (str): Name of the integer label column.
        text_col (str): Name of the tweet text column.
        device: Target device; defaults to CUDA when available, else CPU.
            NOTE(review): train_model_with_hyperparams moves batches to the
            notebook-level `device`, not to this argument — keep the two equal.

    Returns:
        The model holding the weights after the final epoch.
    """

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Prepare datasets and dataloaders: full train set, plus test set for per-epoch metrics
    train_dataset = TweetDataset(train_df, tokenizer, text_col=text_col, label_col=label_col, max_len=80)
    train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True)

    test_dataset = TweetDataset(test_df, tokenizer, text_col=text_col, label_col=label_col, max_len=80)
    test_loader = DataLoader(test_dataset, batch_size=best_params['batch_size'], shuffle=False)

    # Initialize a fresh model for final training
    try:
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5).to(device)
    except Exception:
        # Default loading failed: retry with safetensors weights. Not a bare
        # `except`, so KeyboardInterrupt / SystemExit still propagate.
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5, use_safetensors=True).to(device)

    # Freeze & unfreeze layers according to best_params
    num_layers = best_params['num_layers']

    # Freeze the backbone (DeBERTa is checked first, then RoBERTa-type models
    # such as BERTweet) and re-enable only its last `num_layers` encoder layers.
    for backbone_name in ("deberta", "roberta"):
        backbone = getattr(model, backbone_name, None)
        if backbone is None or not hasattr(backbone, "encoder"):
            continue
        for param in backbone.parameters():
            param.requires_grad = False
        # Guard against num_layers == 0: layer[-0:] would select EVERY layer.
        trainable_layers = backbone.encoder.layer[-num_layers:] if num_layers > 0 else []
        for layer in trainable_layers:
            for param in layer.parameters():
                param.requires_grad = True
        break

    # Always unfreeze classifier (for any model applied)
    for param in model.classifier.parameters():
        param.requires_grad = True

    optimizer = optim.Adam(model.parameters(), lr=best_params['learning_rate'], weight_decay=best_params['weight_decay'])
    criterion = nn.CrossEntropyLoss()

    # Start W&B logging for the final training
    run_name = f"Final_Train_{model_name.replace('/', '-')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    wandb.init(
        project=f"Final_{model_name.replace('/', '-')}_Retrain_{datetime.now().date().strftime('%Y%m%d')}",
        config=best_params,
        name=run_name,
        reinit=True,
    )

    try:
        # Retrain model on train_loader; test_loader only feeds the per-epoch metrics
        train_model_with_hyperparams(
            model,
            model_name=model_name,
            train_loader=train_loader,
            val_loader=test_loader,
            optimizer=optimizer,
            criterion=criterion,
            epochs=best_params['epochs'],
            patience=None,  # No early stopping
            trial=None      # Not an Optuna trial anymore
        )
    finally:
        # Close the W&B run even when training raises, so no run is left open
        wandb.finish()

    return model

In [None]:
# Reload the BERTweet tokenizer (slow implementation, as elsewhere in this notebook)
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

# Final training run: full training set + hyperparameters of the best Optuna trial
final_bertweet_model = retrain_transformer_full(
    model_name='vinai/bertweet-base',
    tokenizer=tokenizer,
    best_params=bertweet_best_params,
    train_df=df_train,
    test_df=df_test,
    text_col='OriginalTweet',
    label_col='label',
    device=device,
)

#### Evaluation Function

In [None]:
def evaluate_model(model, test_loader, criterion, device=None):
    """
    Evaluate `model` on `test_loader` and collect quality, speed and size metrics.

    Args:
        model: Model whose forward output exposes `.logits`.
        test_loader: Iterable of dict-like batches with 'input_ids',
            'attention_mask' and a 'labels' (or 'label') tensor.
        criterion: Loss called as criterion(logits, labels).
        device: Device batches are moved to; defaults to the device of the
            model's first parameter.

    Returns:
        dict: "Loss" (sample-weighted mean), "F1 Score (macro)", "Accuracy",
        "Precision (macro)", "Recall (macro)", "Inference Time (sec)",
        "Inference Time (sec/sample)", "Model Size (Mb)" (size of the serialized
        state_dict in units of 1e6 bytes) and "Parameter Count".
    """
    # Infer device from model if not provided
    if device is None:
        device = next(model.parameters()).device

    model.eval()
    all_labels, all_preds = [], []
    total_loss, total_samples = 0.0, 0

    # perf_counter is a monotonic, high-resolution clock meant for measuring durations;
    # time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()

    with torch.no_grad():
        for batch in test_loader:
            # Move batch to the SAME device as the model
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # label key can be 'labels' (PyTorch dataset) or 'label' (HF dataset)
            label_key = 'labels' if 'labels' in batch else 'label'
            labels = batch[label_key].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)

            total_loss += loss.item() * input_ids.size(0)
            total_samples += input_ids.size(0)

            preds = logits.argmax(dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    inference_time = time.perf_counter() - start_time
    # max(..., 1) avoids a division by zero on an empty loader
    inference_time_per_sample = inference_time / max(total_samples, 1)

    avg_loss = total_loss / max(total_samples, 1)
    accuracy = accuracy_score(all_labels, all_preds)
    macro_f1 = f1_score(all_labels, all_preds, average='macro')
    macro_precision = precision_score(all_labels, all_preds, average='macro')
    macro_recall = recall_score(all_labels, all_preds, average='macro')

    # Model size (MB): serialize the weights to a scratch file and measure it.
    # try/finally guarantees the scratch file is removed even if saving or stat fails.
    tmp_path = "tmp_model_eval.pt"
    try:
        torch.save(model.state_dict(), tmp_path)
        model_size_mb = os.path.getsize(tmp_path) / 1e6
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    # Parameter count
    param_count = sum(p.numel() for p in model.parameters())

    return {
        "Loss": avg_loss,
        "F1 Score (macro)": macro_f1,
        "Accuracy": accuracy,
        "Precision (macro)": macro_precision,
        "Recall (macro)": macro_recall,
        "Inference Time (sec)": inference_time,
        "Inference Time (sec/sample)": inference_time_per_sample,
        "Model Size (Mb)": model_size_mb,
        "Parameter Count": param_count
    }

In [None]:
# Build the test dataset/loader at notebook scope for the standalone evaluation
test_dataset = TweetDataset(df_test, tokenizer, text_col="OriginalTweet", label_col="label", max_len=80)
test_loader = DataLoader(test_dataset, batch_size=bertweet_best_params['batch_size'], shuffle=False)

criterion = nn.CrossEntropyLoss()

metrics = evaluate_model(final_bertweet_model, test_loader, criterion, device)

# Floats are shown with 4 decimals; everything else (e.g. the parameter count) as-is
for metric_name, metric_value in metrics.items():
    if isinstance(metric_value, float):
        print(f"{metric_name}: {metric_value:.4f}")
    else:
        print(f"{metric_name}: {metric_value}")

Loss: 0.9788
F1 Score (macro): 0.6219
Accuracy: 0.6058
Precision (macro): 0.6258
Recall (macro): 0.6243
Inference Time (sec): 5.3786
Inference Time (sec/sample): 0.0014
Model Size (Mb): 539.6995
Parameter Count: 134903813
