In [1]:
import pandas as pd
import numpy as np

In [2]:
import torch
from transformers import pipeline
from transformers import AutoTokenizer, ModernBertForSequenceClassification

import torch.nn.functional as F

In [3]:
########

In [10]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer


class RoBERTaLargeBaselineClassifier:
    def __init__(self, model_name="siebert/sentiment-roberta-large-english"):
        self.device = "cpu"

        # Load tokenizer + fine-tuned RoBERTa-large sentiment classifier
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

        self.model.to(self.device)
        self.model.eval()

        # RoBERTa-large has 24 encoder layers
        self.num_layers = self.model.config.num_hidden_layers  # = 24

    @torch.no_grad()
    def classify(self, text, dataset_name=None):

        # Tokenize
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=False,
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Forward pass through full model
        outputs = self.model(**inputs)
        logits = outputs.logits  # shape [1, 2]

        # Softmax probabilities
        probs = F.softmax(logits, dim=-1)[0]
        conf, pred = torch.max(probs, dim=0)

        # Return: predicted label, #layers used, confidence
        return pred.item(), self.num_layers, conf.item()


In [11]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer


class DistilBERTBaselineClassifier:
    def __init__(self, model_name="distilbert-base-uncased-finetuned-sst-2-english"):
        self.device = "cpu"

        # Load tokenizer + fine-tuned DistilBERT sentiment classifier
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

        self.model.to(self.device)
        self.model.eval()

        # DistilBERT has 6 encoder layers
        self.num_layers = self.model.config.num_hidden_layers  # = 6

    @torch.no_grad()
    def classify(self, text, dataset_name=None):
        # Tokenize text
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=False
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Forward pass
        outputs = self.model(**inputs)
        logits = outputs.logits  # shape [1, 2]

        # Compute probabilities
        probs = F.softmax(logits, dim=-1)[0]
        conf, pred = torch.max(probs, dim=0)

        # Baseline â†’ always uses full 6 layers
        return pred.item(), self.num_layers, conf.item()

In [12]:
import time
import numpy as np

from evaluation.dataset_loaders.sst2 import load_sst2
from evaluation.dataset_loaders.agnews import load_agnews
from evaluation.dataset_loaders.amazon import load_amazon_polarity
from evaluation.dataset_loaders.imdb import load_imdb
from evaluation.dataset_loaders.dbpedia import load_dbpedia
from evaluation.dataset_loaders.yanswers import load_yahoo

dataset_loaders = [
    ("sst2", load_sst2, "classification"),
    ("imdb", load_imdb, "classification"),
    ("amazon_polarity", load_amazon_polarity, "classification")
]

cached_datasets = {}
print("Loading datasets once...\n")

for name, loader, task in dataset_loaders:
    print(f"Loading {name}...")
    cached_datasets[name] = {
        "data": loader(number=500),
        "task": task
    }

print("\nAll datasets loaded.\n")

Loading datasets once...

Loading sst2...
Loading imdb...
Loading amazon_polarity...

All datasets loaded.



In [13]:
# Extract (text, label) from any format
def extract_text_label(sample):
    if isinstance(sample, dict):
        if "text" in sample:
            return sample["text"], sample["label"]
        elif "sentence" in sample:
            return sample["sentence"], sample["label"]
        elif "input_text" in sample:
            return sample["input_text"], sample["label"]
        else:
            raise ValueError("Unknown dict format:", sample)

    if isinstance(sample, (tuple, list)):
        return sample[0], sample[1]

    raise ValueError("Unknown sample format:", sample)

# Evaluation
def evaluate_dataset(model, dataset, dataset_name):
    correct = 0
    total = 0
    layers_used = []
    total_tokens = 0

    start = time.time()

    for sample in dataset:
        text, label = extract_text_label(sample)

        pred, layer, conf = model.classify(text, dataset_name)

        correct += (pred == label)
        total += 1
        layers_used.append(layer)

        total_tokens += len(model.tokenizer(text)["input_ids"])

    end = time.time()
    latency = (end - start) / total

    return {
        "metric": "accuracy",
        "score": correct / total,
        "avg_latency_sec": latency,
        "tokens_per_sec": total_tokens / (end - start),
        "avg_layers_used": float(np.mean(layers_used)),
        "num_samples": total
    }

In [8]:
results_table = []

print("Running BASELINE DISTILBERT")

baseline_model = DistilBERTBaselineClassifier()

# Print number of layers (for logging/reporting)
print("DistilBERT model layers:", baseline_model.num_layers)

for name, meta in cached_datasets.items():
    dataset = meta["data"]
    print(f"\nTesting BASELINE on {name}...")

    result = evaluate_dataset(baseline_model, dataset, name)
    print(name, result)

    results_table.append({
        "dataset": name,
        "threshold": None,
        "mode": "baseline",
        "model": "Distil-BERT",
        "metric": result["metric"],
        "score": float(result["score"]),
        "avg_latency_sec": float(result["avg_latency_sec"]),
        "tokens_per_sec": float(result["tokens_per_sec"]),
        "avg_layers_used": float(result["avg_layers_used"]),
        "num_samples": int(result["num_samples"]),
    })

Running BASELINE

Testing BASELINE on sst2...
sst2 {'metric': 'accuracy', 'score': 0.908, 'avg_latency_sec': 0.012505305767059325, 'tokens_per_sec': 1977.080805583049, 'avg_layers_used': 6.0, 'num_samples': 500}

Testing BASELINE on imdb...


Token indices sequence length is longer than the specified maximum sequence length for this model (953 > 512). Running this sequence through the model will result in indexing errors


imdb {'metric': 'accuracy', 'score': 0.872, 'avg_latency_sec': 0.03287814998626709, 'tokens_per_sec': 9068.57594251921, 'avg_layers_used': 6.0, 'num_samples': 500}

Testing BASELINE on amazon_polarity...
amazon_polarity {'metric': 'accuracy', 'score': 0.134, 'avg_latency_sec': 0.015406723499298097, 'tokens_per_sec': 6344.762402236493, 'avg_layers_used': 6.0, 'num_samples': 500}


In [14]:
print("Running BASELINE ROBERTA")

baseline_model_roberta = RoBERTaLargeBaselineClassifier()

print("RoBERTa Large model layers:", baseline_model.num_layers)

for name, meta in cached_datasets.items():
    dataset = meta["data"]
    print(f"\nTesting BASELINE on {name}...")

    result = evaluate_dataset(baseline_model_roberta, dataset, name)
    print(name, result)

    results_table.append({
        "dataset": name,
        "threshold": None,
        "mode": "baseline",
        "model": "ROBERTA-Large",
        "metric": result["metric"],
        "score": float(result["score"]),
        "avg_latency_sec": float(result["avg_latency_sec"]),
        "tokens_per_sec": float(result["tokens_per_sec"]),
        "avg_layers_used": float(result["avg_layers_used"]),
        "num_samples": int(result["num_samples"]),
    })

Running BASELINE ROBERTA


tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]


Testing BASELINE on sst2...
sst2 {'metric': 'accuracy', 'score': 0.906, 'avg_latency_sec': 0.059351471900939944, 'tokens_per_sec': 430.8907459394446, 'avg_layers_used': 24.0, 'num_samples': 500}

Testing BASELINE on imdb...


Token indices sequence length is longer than the specified maximum sequence length for this model (907 > 512). Running this sequence through the model will result in indexing errors


imdb {'metric': 'accuracy', 'score': 0.946, 'avg_latency_sec': 0.21600387859344483, 'tokens_per_sec': 1324.531771665523, 'avg_layers_used': 24.0, 'num_samples': 500}

Testing BASELINE on amazon_polarity...
amazon_polarity {'metric': 'accuracy', 'score': 0.04, 'avg_latency_sec': 0.09732852172851562, 'tokens_per_sec': 981.8088089999533, 'avg_layers_used': 24.0, 'num_samples': 500}


In [15]:
import pandas as pd
df=pd.DataFrame(results_table)
df_sorted = (
    df.groupby("dataset", group_keys=True)
      .apply(lambda g: g.sort_values("score", ascending=False))
      .reset_index(drop=True)
)
df_sorted

  .apply(lambda g: g.sort_values("score", ascending=False))


Unnamed: 0,dataset,threshold,mode,model,metric,score,avg_latency_sec,tokens_per_sec,avg_layers_used,num_samples
0,amazon_polarity,,baseline,Distil-BERT,accuracy,0.134,0.015407,6344.762402,6.0,500
1,amazon_polarity,,baseline,ROBERTA-Large,accuracy,0.04,0.097329,981.808809,24.0,500
2,imdb,,baseline,ROBERTA-Large,accuracy,0.946,0.216004,1324.531772,24.0,500
3,imdb,,baseline,Distil-BERT,accuracy,0.872,0.032878,9068.575943,6.0,500
4,sst2,,baseline,Distil-BERT,accuracy,0.908,0.012505,1977.080806,6.0,500
5,sst2,,baseline,ROBERTA-Large,accuracy,0.906,0.059351,430.890746,24.0,500
