In [None]:
import sys
import os

!git clone https://github.com/komalniraula/adaptive-inference-llm

repo_name = 'adaptive-inference-llm' # Must match the folder created by git clone
project_path = os.path.join('/content', repo_name)

# Append the project root directory to the system path

sys.path.append(project_path)

fatal: destination path 'adaptive-inference-llm' already exists and is not an empty directory.


In [None]:
import pandas as pd
import numpy as np

In [None]:
import torch
from transformers import pipeline
from transformers import AutoTokenizer, ModernBertForSequenceClassification

import torch.nn.functional as F

In [None]:
# Load dataset
import time

from evaluation.dataset_loaders.sst2 import load_sst2

# Registry of datasets to evaluate: (dataset name, loader callable, task type).
dataset_loaders = [("sst2", load_sst2, "classification")]

# Every dataset is loaded a single time here and kept in memory, so the
# evaluation cells below can reuse the same samples.
print("Loading datasets once...\n")
cached_datasets = {}

for name, loader, task in dataset_loaders:
    print(f"Loading {name}...")
    loaded = loader(task='test', fraction=1)
    cached_datasets[name] = {"data": loaded, "task": task}

# Report how many samples each cached dataset holds.
for data in cached_datasets:
    n_samples = len(cached_datasets[data]['data'])
    print(f"Length of the {data} dataset: {n_samples}")

print("\nAll datasets loaded.\n")

Loading datasets once...

Loading sst2...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Length of the sst2 dataset: 872

All datasets loaded.



In [None]:
from transformers import AutoModelForSequenceClassification


class RoBERTaLargeBaselineClassifier:
    """Full-depth baseline around a sequence-classification checkpoint.

    Loads the tokenizer and model identified by ``model_name`` (by default a
    RoBERTa-large English sentiment checkpoint), places the model on the GPU
    when CUDA is available (CPU otherwise) and switches it to eval mode.
    """

    def __init__(self, model_name="siebert/sentiment-roberta-large-english"):
        has_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if has_cuda else "cpu")

        # Tokenizer and classifier come from the same checkpoint name.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

        self.model.to(self.device)
        self.model.eval()

        # Encoder depth as reported by the model config; returned by
        # classify() as the "layers used" figure.
        self.num_layers = self.model.config.num_hidden_layers

    @torch.no_grad()
    def classify(self, text, dataset_name=None):
        """Classify ``text`` with the full model.

        Args:
            text: Input passed straight to the tokenizer.
            dataset_name: Accepted but not used by this method.

        Returns:
            Tuple ``(predicted_label, num_layers, confidence)`` where the
            label is the argmax index over the softmax of the first row of
            logits and the confidence is that maximum probability.
        """
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=False,
        )

        # Move every tensor produced by the tokenizer onto the model's device.
        on_device = {}
        for key, tensor in encoded.items():
            on_device[key] = tensor.to(self.device)

        logits = self.model(**on_device).logits

        # Only the first row of the batch is scored.
        first_row_probs = F.softmax(logits, dim=-1)[0]
        conf, pred = first_row_probs.max(dim=0)

        return pred.item(), self.num_layers, conf.item()


In [None]:
class DistilBERTBaselineClassifier:
    """Non-adaptive baseline around a sequence-classification checkpoint.

    By default loads a DistilBERT checkpoint fine-tuned for SST-2 sentiment.
    Any other checkpoint name accepted by the ``from_pretrained`` loaders can
    be passed via ``model_name``.
    """

    def __init__(self, model_name="distilbert-base-uncased-finetuned-sst-2-english"):
        # Prefer the GPU when one is visible to torch.
        device_kind = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_kind)

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

        # Inference only: move to the device and disable training behaviour.
        self.model.to(self.device)
        self.model.eval()

        # Number of encoder layers taken from the model config.
        self.num_layers = self.model.config.num_hidden_layers

    @torch.no_grad()
    def classify(self, text, dataset_name=None):
        """Run one forward pass and return the top prediction.

        Args:
            text: Input handed to the tokenizer.
            dataset_name: Ignored here; present in the signature only.

        Returns:
            ``(label_index, layers_used, confidence)``. Baseline -> the
            layer count is always the model's full depth.
        """
        batch = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=False
        )
        batch = {field: values.to(self.device) for field, values in batch.items()}

        result = self.model(**batch)

        # Probabilities for the first row of logits, then its best class.
        probabilities = F.softmax(result.logits, dim=-1)[0]
        top = torch.max(probabilities, dim=0)
        conf, pred = top

        return pred.item(), self.num_layers, conf.item()

In [None]:
# Extract (text, label) from any format
def extract_text_label(sample):
    """Return ``(text, label)`` from a dataset sample.

    Supported formats:
      * dict with a ``"label"`` key and one of the text keys ``"text"``,
        ``"sentence"`` or ``"input_text"`` (checked in that order);
      * tuple or list whose first two items are text and label.

    Raises:
        ValueError: if the sample is a dict without a known text key, or is
            neither a dict nor a tuple/list.
        KeyError: if a dict sample has a text key but no ``"label"`` key.
        IndexError: if a tuple/list sample has fewer than two items.
    """
    if isinstance(sample, dict):
        # First matching key wins, same priority as the original if/elif chain.
        for text_key in ("text", "sentence", "input_text"):
            if text_key in sample:
                return sample[text_key], sample["label"]
        # Build ONE message string: passing the sample as a second positional
        # argument makes the exception render as a tuple.
        raise ValueError(f"Unknown dict format: {sample!r}")

    if isinstance(sample, (tuple, list)):
        return sample[0], sample[1]

    raise ValueError(f"Unknown sample format: {sample!r}")

# Evaluation
def evaluate_dataset(model, dataset, dataset_name):
    """Evaluate ``model`` on every sample of ``dataset`` and summarise the run.

    Args:
        model: Object exposing ``classify(text, dataset_name)`` returning
            ``(pred, layers_used, confidence)`` and a callable ``tokenizer``
            whose result has an ``"input_ids"`` entry.
        dataset: Iterable of samples understood by ``extract_text_label``.
        dataset_name: Forwarded unchanged to ``model.classify``.

    Returns:
        dict with keys ``metric``, ``score`` (accuracy), ``avg_latency_sec``,
        ``tokens_per_sec``, ``avg_layers_used`` and ``num_samples``. For an
        empty dataset the averages are NaN, ``tokens_per_sec`` is 0.0 and
        ``num_samples`` is 0 (instead of raising ZeroDivisionError).
    """
    correct = 0
    total = 0
    layers_used = []
    total_tokens = 0

    # perf_counter is monotonic and high resolution, unlike time.time().
    start = time.perf_counter()

    for sample in dataset:
        text, label = extract_text_label(sample)

        # The confidence is not needed for these aggregate metrics.
        pred, layer, _conf = model.classify(text, dataset_name)

        correct += int(pred == label)
        total += 1
        layers_used.append(layer)

        # NOTE: this extra tokenization runs inside the timed region, so it is
        # included in the reported latency / throughput.
        total_tokens += len(model.tokenizer(text)["input_ids"])

    elapsed = time.perf_counter() - start

    if total == 0:
        # Nothing was evaluated: averages are undefined rather than zero.
        undefined = float("nan")
        return {
            "metric": "accuracy",
            "score": undefined,
            "avg_latency_sec": undefined,
            "tokens_per_sec": 0.0,
            "avg_layers_used": undefined,
            "num_samples": 0
        }

    return {
        "metric": "accuracy",
        "score": correct / total,
        "avg_latency_sec": elapsed / total,
        # Guard against a zero-length interval on very coarse clocks.
        "tokens_per_sec": total_tokens / elapsed if elapsed > 0 else float("nan"),
        "avg_layers_used": float(np.mean(layers_used)),
        "num_samples": total
    }

In [None]:
# Accumulates one summary row per (model, dataset) evaluation.
results_table = []

print("Running BASELINE DISTILBERT finetuned on SST2")

# Default checkpoint of the DistilBERT baseline wrapper.
baseline_model = DistilBERTBaselineClassifier()

# Log the encoder depth for reporting.
print("DistilBERT sst2 finetuned model layers:", baseline_model.num_layers)

for name, meta in cached_datasets.items():
    dataset = meta["data"]
    print(f"\nTesting BASELINE on {name}...")

    result = evaluate_dataset(baseline_model, dataset, name)
    print(name, result)

    # Numeric metrics are cast to plain Python numbers; key order fixes the
    # column order of the results table.
    results_table.append({
        "dataset": name,
        "threshold": None,
        "mode": "baseline",
        "model": "Distil-BERT sst finetuned",
        "metric": result["metric"],
        **{
            field: float(result[field])
            for field in ("score", "avg_latency_sec", "tokens_per_sec", "avg_layers_used")
        },
        "num_samples": int(result["num_samples"]),
    })

Running BASELINE DISTILBERT finetuned on SST2
DistilBERT sst2 finetuned model layers: 6

Testing BASELINE on sst2...
sst2 {'metric': 'accuracy', 'score': 0.9105504587155964, 'avg_latency_sec': 0.004897614684673624, 'tokens_per_sec': 5138.009509901858, 'avg_layers_used': 6.0, 'num_samples': 872}


In [None]:
print("Running BASELINE DISTILBERT uncased")

# NOTE: "distilbert/distilbert-base-uncased" is the base checkpoint; when it is
# loaded for sequence classification, transformers reports the classifier
# weights as newly initialized, so its predictions are untrained.
baseline_model_distil_uncased = DistilBERTBaselineClassifier(model_name="distilbert/distilbert-base-uncased")

# Print number of layers (for logging/reporting)
print("DistilBERT uncased model layers:", baseline_model_distil_uncased.num_layers)

for name, meta in cached_datasets.items():
    dataset = meta["data"]
    print(f"\nTesting BASELINE on {name}...")

    # Evaluate the model created in THIS cell. The previous code passed
    # `baseline_model` (the SST-2 fine-tuned model from the earlier cell), so
    # this row merely duplicated that model's score.
    result = evaluate_dataset(baseline_model_distil_uncased, dataset, name)
    print(name, result)

    results_table.append({
        "dataset": name,
        "threshold": None,
        "mode": "baseline",
        "model": "Distil-BERT uncased",
        "metric": result["metric"],
        "score": float(result["score"]),
        "avg_latency_sec": float(result["avg_latency_sec"]),
        "tokens_per_sec": float(result["tokens_per_sec"]),
        "avg_layers_used": float(result["avg_layers_used"]),
        "num_samples": int(result["num_samples"]),
    })

Running BASELINE DISTILBERT uncased


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBERT uncased model layers: 6

Testing BASELINE on sst2...
sst2 {'metric': 'accuracy', 'score': 0.9105504587155964, 'avg_latency_sec': 0.004632979904839752, 'tokens_per_sec': 5431.491468245093, 'avg_layers_used': 6.0, 'num_samples': 872}


In [None]:
print("Running BASELINE ROBERTA")

# Default checkpoint of the RoBERTa-large baseline wrapper.
baseline_model_roberta = RoBERTaLargeBaselineClassifier()

print("RoBERTa Large model layers:", baseline_model_roberta.num_layers)

for name, meta in cached_datasets.items():
    dataset = meta["data"]
    print(f"\nTesting BASELINE on {name}...")

    result = evaluate_dataset(baseline_model_roberta, dataset, name)
    print(name, result)

    # Descriptive columns first, then the measured metrics as plain numbers.
    row = dict(
        dataset=name,
        threshold=None,
        mode="baseline",
        model="ROBERTA-Large",
        metric=result["metric"],
    )
    for float_field in ("score", "avg_latency_sec", "tokens_per_sec", "avg_layers_used"):
        row[float_field] = float(result[float_field])
    row["num_samples"] = int(result["num_samples"])

    results_table.append(row)

Running BASELINE ROBERTA
RoBERTa Large model layers: 24

Testing BASELINE on sst2...
sst2 {'metric': 'accuracy', 'score': 0.9243119266055045, 'avg_latency_sec': 0.015610218321511505, 'tokens_per_sec': 1666.1633922239846, 'avg_layers_used': 24.0, 'num_samples': 872}


In [None]:
# One row per (model, dataset) evaluation collected by the cells above.
df = pd.DataFrame(results_table)

# Order rows by dataset name, then best score first within each dataset.
# A plain two-key sort replaces groupby(...).apply(sort_values): apply on the
# grouping column is deprecated in pandas, and once that column is excluded
# from the groups, reset_index(drop=True) would discard "dataset" entirely.
# Note: unlike groupby, this keeps rows whose "dataset" is missing (sorted last).
df_sorted = (
    df.sort_values(["dataset", "score"], ascending=[True, False])
      .reset_index(drop=True)
)
df_sorted

  .apply(lambda g: g.sort_values("score", ascending=False))


Unnamed: 0,dataset,threshold,mode,model,metric,score,avg_latency_sec,tokens_per_sec,avg_layers_used,num_samples
0,sst2,,baseline,ROBERTA-Large,accuracy,0.924312,0.01561,1666.163392,24.0,872
1,sst2,,baseline,Distil-BERT sst finetuned,accuracy,0.91055,0.004898,5138.00951,6.0,872
2,sst2,,baseline,Distil-BERT uncased,accuracy,0.91055,0.004633,5431.491468,6.0,872
