# Natural Language Processing (NLP) - Advanced Topics in DL

Group Z:<br>
    - Iliya Morgunov - 206361412<br>
    - Eadan Schechter - 209793553

Inference on Fine-Tuned, Compressed, and Distilled Models.
This notebook loads all previously saved `.pt` models (fine-tuned, quantized, pruned, and distilled) and evaluates them on the test dataset `df_test_final.csv`.

In [1]:
import os
import time
import re
import shutil
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.metrics import (
    f1_score, accuracy_score, precision_score, recall_score
)

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from torch.quantization import quantize_dynamic

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression so the notebook displays which device was selected.
device

device(type='cuda')

Load Test Data

In [2]:
# Expect to be in the same directory as df_test_final.csv
df_test = pd.read_csv("df_test_final.csv")

# Map Sentiment column to integer labels (same order as in the fine-tuning notebook)
sentiment_classes = ['Negative', 'Neutral', 'Positive', 'Extremely Negative', 'Extremely Positive']
label2id = {label: idx for idx, label in enumerate(sentiment_classes)}

# Create the 'label' column used by the dataset class (kept as-is if the CSV already has one)
if 'label' not in df_test.columns:
    mapped_labels = df_test['Sentiment'].map(label2id)

    # Values missing from label2id map to NaN; report them by name instead of
    # letting astype(int) fail with an opaque NaN-to-integer cast error.
    unmapped_rows = mapped_labels.isna()
    if unmapped_rows.any():
        unknown_values = sorted(df_test.loc[unmapped_rows, 'Sentiment'].astype(str).unique())
        raise ValueError(
            f"Sentiment values without a label id in df_test_final.csv: {unknown_values}"
        )

    df_test['label'] = mapped_labels.astype(int)

print("Test shape:", df_test.shape)
df_test[['TweetWithDateLocation', 'Sentiment', 'label']].head()

Test shape: (3798, 11)


Unnamed: 0,TweetWithDateLocation,Sentiment,label
0,[tweet] TRENDING: New Yorkers encounter empty ...,Extremely Negative,3
1,[tweet] When I couldn't find hand sanitizer at...,Positive,2
2,[tweet] Find out how you can protect yourself ...,Extremely Positive,4
3,[tweet] buying hits City as anxious shoppers s...,Negative,0
4,[tweet] One week everyone buying baby milk pow...,Neutral,1


Dataset & Tokenization

In [3]:
class TweetDataset(Dataset):
    """
    Map-style PyTorch dataset over a DataFrame of tweets and integer labels.

    Rows are tokenized lazily, one at a time, when an item is requested.

    Args:
        dataframe: Frame holding the text and label columns.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        text_col: Name of the text column; values are cast to ``str``.
        label_col: Name of the label column; values are stored as a plain list.
        max_len: ``max_length`` passed to the tokenizer (with padding to
            ``max_length`` and truncation enabled).
    """

    def __init__(self, dataframe, tokenizer, text_col="TweetWithDateLocation", label_col="label", max_len=128):
        self.texts = dataframe[text_col].astype(str).tolist()
        self.labels = dataframe[label_col].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the source DataFrame."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Tokenize row ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (the tokenizer's
            tensors with the leading batch axis removed) and ``labels``
            (a scalar ``torch.long`` tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # squeeze(0) drops only the batch axis added by return_tensors='pt'.
            # A bare squeeze() would also collapse the sequence axis whenever it
            # has length 1 (max_len == 1), yielding 0-d tensors that no longer
            # collate into (batch, seq) batches.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

Evaluation Function

In [4]:
def evaluate_model(model, dataloader, eval_device=None):
    """
    Run ``model`` over ``dataloader`` and compute classification metrics.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        dataloader: Iterable of dict batches with the keys ``input_ids``,
            ``attention_mask`` and ``labels``.
        eval_device: Device the inputs are moved to. Defaults to the device of
            the model's first parameter, or the CPU when the model exposes no
            parameters.

    Returns:
        dict with: Accuracy, Macro F1, Precision, Recall
        (F1, precision and recall are macro-averaged).

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    if eval_device is None:
        # next(..., None) avoids a bare StopIteration on parameter-less models.
        first_param = next(model.parameters(), None)
        eval_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(eval_device)
            attention_mask = batch['attention_mask'].to(eval_device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Softmax is monotonic, so the arg-max of the logits is the predicted
            # class; the probabilities themselves were never used downstream.
            preds = outputs.logits.argmax(dim=-1)

            all_preds.append(preds.cpu().numpy())
            # Labels are only needed on the host for scikit-learn.
            all_labels.append(batch['labels'].cpu().numpy())

    if not all_preds:
        raise ValueError("evaluate_model received an empty dataloader; nothing to evaluate.")

    y_pred = np.concatenate(all_preds, axis=0)
    y_true = np.concatenate(all_labels, axis=0)

    metrics = {
        "Accuracy": accuracy_score(y_true, y_pred),
        # zero_division=0 matches precision/recall below: a class that is never
        # predicted scores 0 without emitting an UndefinedMetricWarning.
        "Macro F1": f1_score(y_true, y_pred, average='macro', zero_division=0),
        "Precision": precision_score(y_true, y_pred, average='macro', zero_division=0),
        "Recall": recall_score(y_true, y_pred, average='macro', zero_division=0),
    }

    return metrics

Load All Models

In [5]:
import os
import re

# Directory that is scanned for saved ``.pt`` checkpoints and joined with each
# file name when a checkpoint is loaded.
MODEL_DIR = "."  # same directory as notebook

def infer_backbone_from_filename(fname: str) -> str:
    """
    Resolve a checkpoint file name to a Hugging Face backbone identifier.

    Matching is case-insensitive and substring based. The rules are tried in
    order and the first hit wins:

    1. BERTweet names            -> ``vinai/bertweet-base``
    2. DeBERTa base names        -> ``microsoft/deberta-base``
    3. BERT-tiny names           -> ``prajjwal1/bert-tiny``
    4. DeBERTa v3 small names    -> ``microsoft/deberta-v3-small``
    5. anything else "deberta"   -> ``microsoft/deberta-base``

    A name that matches none of the rules falls back to
    ``vinai/bertweet-base``.
    """
    lowered = fname.lower()

    # (substrings, backbone) pairs in priority order.
    rules = (
        (("bertweet", "vinai-bertweet-base"), "vinai/bertweet-base"),
        (("deberta-base", "microsoft-deberta-base", "deberta_base"), "microsoft/deberta-base"),
        (("bert-tiny", "prajjwal1-bert-tiny"), "prajjwal1/bert-tiny"),
        (("deberta-v3-small", "microsoft-deberta-v3-small"), "microsoft/deberta-v3-small"),
        (("deberta",), "microsoft/deberta-base"),
    )

    for needles, backbone in rules:
        if any(needle in lowered for needle in needles):
            return backbone

    # Last resort when nothing matched.
    return "vinai/bertweet-base"


def is_quantized_file(fname: str) -> bool:
    """Return True when the lower-cased file name contains ``_quantized``."""
    marker = "_quantized"
    lowered = fname.lower()
    return marker in lowered


def sorted_pt_files_in_dir(d: str = "."):
    """Return the entries of directory ``d`` whose names end with ``.pt``, sorted ascending."""
    return sorted(entry for entry in os.listdir(d) if entry.endswith(".pt"))

Run Evaluation for Each Model

In [6]:
import torch
from torch import nn
from torch.quantization import quantize_dynamic
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

def load_finetuned_model(backbone: str, state_dict_path: str, num_labels: int = 5, quantized: bool = False):
    """
    Build architecture from CONFIG (no hub weights), then load the saved .pt state_dict.
    No HF-folder or safetensors fallback — .pt only.

    Args:
        backbone: Hugging Face model identifier used to fetch the config.
        state_dict_path: Path of the ``.pt`` file holding the state_dict.
        num_labels: Size of the classification head.
        quantized: When True, the architecture is dynamically quantized
            (``nn.Linear`` -> int8) before the weights are loaded, and the
            model stays on the CPU.

    Returns:
        ``(model, eval_device)`` — the model in eval mode on ``eval_device``.

    Raises:
        RuntimeError: If the file cannot be loaded, does not contain a dict,
            or shares no parameter names with the requested architecture.
    """
    # Quantized checkpoints are evaluated on the CPU; everything else uses the
    # notebook-level device.
    eval_device = torch.device("cpu") if quantized else device

    # Always deserialize onto the CPU: the architecture is built on the CPU and
    # moved once at the end, so mapping the checkpoint to the GPU first would
    # only add a GPU -> CPU -> GPU round trip.
    try:
        try:
            # Safe path first: restricts unpickling to tensors and plain containers.
            state = torch.load(state_dict_path, map_location="cpu", weights_only=True)
        except TypeError:
            # PyTorch builds that predate the ``weights_only`` keyword.
            state = torch.load(state_dict_path, map_location="cpu")
        except OSError:
            # Missing or unreadable file: retrying with another mode cannot help.
            raise
        except Exception as safe_err:
            # NOTE(security): full unpickling can execute arbitrary code. It is
            # kept only as a fallback for checkpoints produced locally by the
            # training notebooks; never point this at untrusted files.
            print(
                f"weights_only load failed for '{state_dict_path}' "
                f"({type(safe_err).__name__}); retrying with full unpickling."
            )
            state = torch.load(state_dict_path, map_location="cpu", weights_only=False)
    except Exception as e:
        raise RuntimeError(f"torch.load failed on '{state_dict_path}': {e}") from e

    if not isinstance(state, dict):
        raise RuntimeError(f"Loaded object from '{state_dict_path}' is not a state_dict (dict).")

    # Build empty architecture from config (prevents 'newly initialized' head warning)
    cfg = AutoConfig.from_pretrained(backbone, num_labels=num_labels)
    model = AutoModelForSequenceClassification.from_config(cfg)

    # Optional: dynamic int8 quantization (CPU-only). Applied before loading so
    # the module structure matches a state_dict saved from a quantized model.
    if quantized:
        model = quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)

    # With strict=False a checkpoint whose keys match nothing would load
    # "successfully" and leave every weight at its random initialisation.
    if not set(state).intersection(model.state_dict().keys()):
        raise RuntimeError(
            f"No parameter names in '{state_dict_path}' match backbone '{backbone}'; "
            "refusing to evaluate a randomly initialised model."
        )

    # Load weights (non-strict for minor HF version diffs)
    missing, unexpected = model.load_state_dict(state, strict=False)
    if missing or unexpected:
        print("load_state_dict info -> missing:", missing, "| unexpected:", unexpected)

    model.to(eval_device).eval()
    return model, eval_device


# ===== Inference loop (same tokenizer style as training: use_fast=False) =====
results = []

# One DataLoader per backbone: the tokenizer and the tokenized view of the test
# set depend only on the backbone, so checkpoints sharing one can reuse them.
loaders_by_backbone = {}

# Bound up front so the previous model can be released at the top of each iteration.
model = None

all_model_files = sorted_pt_files_in_dir(MODEL_DIR)
print("Discovered .pt files:", all_model_files)

for fname in all_model_files:
    backbone = infer_backbone_from_filename(fname)
    quantized = is_quantized_file(fname)
    print(f"\n>>> Evaluating: {fname}\n    Backbone: {backbone}\n    Quantized: {quantized}")

    if backbone not in loaders_by_backbone:
        tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=False)

        test_dataset = TweetDataset(
            df_test, tokenizer,
            text_col="TweetWithDateLocation",
            label_col="label",
            max_len=128
        )
        loaders_by_backbone[backbone] = DataLoader(test_dataset, batch_size=32, shuffle=False)
    test_loader = loaders_by_backbone[backbone]

    # Release the previous checkpoint before loading the next one so two models
    # are never resident at the same time.
    model = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    model, eval_device = load_finetuned_model(backbone, os.path.join(MODEL_DIR, fname), num_labels=5, quantized=quantized)

    # Evaluate on the device the model was placed on (CPU for quantized models).
    metrics = evaluate_model(model, test_loader, eval_device)
    metrics["Model"] = fname
    results.append(metrics)

Discovered .pt files: ['microsoft-deberta-base_fine_tuned_hf_model.pt', 'vinai-bertweet-base_fine_tuned_hf_model_quantized.pt', 'vinai-bertweet-base_fine_tuned_pytorch_model.pt', 'vinai-bertweet-base_fine_tuned_pytorch_model_pruned.pt', 'vinai-bertweet-base_fine_tuned_pytorch_model_quantized.pt']

>>> Evaluating: microsoft-deberta-base_fine_tuned_hf_model.pt
    Backbone: microsoft/deberta-base
    Quantized: False

>>> Evaluating: vinai-bertweet-base_fine_tuned_hf_model_quantized.pt
    Backbone: vinai/bertweet-base
    Quantized: True


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0



>>> Evaluating: vinai-bertweet-base_fine_tuned_pytorch_model.pt
    Backbone: vinai/bertweet-base
    Quantized: False


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0



>>> Evaluating: vinai-bertweet-base_fine_tuned_pytorch_model_pruned.pt
    Backbone: vinai/bertweet-base
    Quantized: False


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0



>>> Evaluating: vinai-bertweet-base_fine_tuned_pytorch_model_quantized.pt
    Backbone: vinai/bertweet-base
    Quantized: True


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Collect Results

In [8]:
# Passing ``columns`` selects and orders the metric columns in one step and
# still yields a well-formed (empty) table when no checkpoint was evaluated,
# instead of raising KeyError on the column selection.
result_columns = ["Model", "Accuracy", "Macro F1", "Precision", "Recall"]
df_results = pd.DataFrame(results, columns=result_columns)

print("\n=== Final Results ===")
# to_string() prints every column in full; the default repr truncates the long
# checkpoint names, which makes similarly named rows hard to tell apart.
print(df_results.to_string())

# Save
df_results.to_csv("model_comparison_results.csv", index=False)


=== Final Results ===
                                               Model  Accuracy  Macro F1  \
0      microsoft-deberta-base_fine_tuned_hf_model.pt  0.631122  0.644625   
1  vinai-bertweet-base_fine_tuned_hf_model_quanti...  0.301211  0.240275   
2    vinai-bertweet-base_fine_tuned_pytorch_model.pt  0.594787  0.610027   
3  vinai-bertweet-base_fine_tuned_pytorch_model_p...  0.572670  0.580404   
4  vinai-bertweet-base_fine_tuned_pytorch_model_q...  0.307794  0.168288   

   Precision    Recall  
0   0.647044  0.642687  
1   0.347363  0.309262  
2   0.617176  0.604966  
3   0.601918  0.582152  
4   0.568959  0.235898  
