In [1]:
!pip install transformers evaluate scikit-learn scipy pandas



In [11]:
import torch
import copy
import pandas as pd 
from torch import nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

PRETRAIN_MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
BEST_TRIAL = 11

#### Load data

In [None]:
train_df = pd.read_csv("data/split_train_data.csv").dropna(subset=["Label"]).reset_index(drop=True)
eval_df = pd.read_csv("data/split_eval_data.csv").dropna(subset=["Label"]).reset_index(drop=True)

#### Load Best Model

In [13]:
CKPT_PT = f"models/training/roberta_best_model_trial_{BEST_TRIAL}.pt"
tokenizer = AutoTokenizer.from_pretrained(PRETRAIN_MODEL, use_fast=True)
best_model = AutoModelForSequenceClassification.from_pretrained(PRETRAIN_MODEL, num_labels=5, ignore_mismatched_sizes=True)

state = torch.load(CKPT_PT, map_location="cpu")
best_model.load_state_dict(state) 
best_model.eval()
baseline_num_params = sum(p.numel() for p in best_model.parameters())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### 1. QUANTIZATION DEMO

In [14]:
from torch.ao.quantization import quantize_dynamic

# Apply dynamic quantization on Linear layers
quantized_model = quantize_dynamic(best_model, {nn.Linear}, dtype=torch.qint8)

quantized_num_params = sum(p.numel() for p in quantized_model.parameters())
print("Original model params:", baseline_num_params)
print("Quantized model params:", quantized_num_params)

Original model params: 124649477
Quantized model params: 39037440


#### 2. Pruning

In [15]:
from torch.nn.utils import prune

# Only prune FFN dense layers, not attention or classifier
def collect_ffn_linears(model):
    pairs = []
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            if ".intermediate.dense" in name or ".output.dense" in name:
                pairs.append((module, "weight"))
    return pairs

model_to_prune = copy.deepcopy(best_model)
parameters_to_prune = collect_ffn_linears(model_to_prune)

# Global L1 pruning across FFN weights (40%)
prune.global_unstructured(parameters_to_prune, prune.L1Unstructured, amount=0.4)

# Remove masks (make pruning permanent)
for m, _ in parameters_to_prune:
    prune.remove(m, "weight")


print("Pruning applied: 40% of weights zeroed out in Linear layers.")
pruned_num_params = sum((p != 0).sum().item() for p in model_to_prune.parameters())
print("Remaining non-zero parameters:", pruned_num_params)



Pruning applied: 40% of weights zeroed out in Linear layers.
Remaining non-zero parameters: 99168312


#### 3 Distillation

In [16]:
from torch.nn import functional as F
from transformers import Trainer, TrainingArguments

# Teacher = your best trained model
teacher = best_model.to(device)
teacher.eval()
for p in teacher.parameters():
    p.requires_grad_(False)

# Student = smaller pre-trained model
student_name = "distilbert/distilroberta-base"
student = AutoModelForSequenceClassification.from_pretrained(student_name, num_labels=5).to(device)

class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, temperature=2.0, alpha=0.5, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        self.temperature = temperature
        self.alpha = alpha

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs_student = model(**inputs)
        with torch.no_grad():
            #outputs_teacher = self.teacher(**inputs)
            #outputs_teacher = self.teacher(**inputs)
            outputs_teacher = self.teacher(
                input_ids=inputs["input_ids"],
                attention_mask=inputs.get("attention_mask", None)
            )

        labels = labels.to(outputs_student.logits.device).long().view(-1)
        # labels = inputs["labels"].to(outputs_student.logits.device)
        # labels = labels.long().view(-1)

        loss_ce = F.cross_entropy(outputs_student.logits, labels)
        loss_kl = F.kl_div(
            F.log_softmax(outputs_student.logits / self.temperature, dim=-1),
            F.softmax(outputs_teacher.logits / self.temperature, dim=-1),
            reduction="batchmean",
        ) * (self.temperature ** 2)

        loss = self.alpha * loss_ce + (1 - self.alpha) * loss_kl
        return (loss, outputs_student) if return_outputs else loss

# Wrap your datasets in torch Dataset -> DataLoader is handled by Trainer
from datasets import Dataset as HFDataset

train_hf = HFDataset.from_pandas(train_df.rename(columns={"Label": "labels"}))
eval_hf = HFDataset.from_pandas(eval_df.rename(columns={"Label": "labels"}))

def tokenize_fn(batch):
    texts = [str(x) for x in batch["OriginalTweet"]] 
    tokenized = tokenizer(texts, padding="max_length", truncation=True, max_length=128)
    tokenized["labels"] = [int(x) for x in batch["labels"]]
    return tokenized

#train_tok.reset_format()
#eval_tok.reset_format()
train_tok = train_hf.map(tokenize_fn, batched=True)
eval_tok = eval_hf.map(tokenize_fn, batched=True)
train_tok.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval_tok.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/28808 [00:00<?, ? examples/s]

Map:   0%|          | 0/12348 [00:00<?, ? examples/s]

In [17]:
args = TrainingArguments(
    output_dir="./distill_out",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    eval_strategy="epoch",
    logging_steps=50,
    save_strategy="no",
)

trainer = DistillationTrainer(
    model=student,
    teacher_model=teacher,
    args=args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33myoyulia[0m ([33myoyulia-tel-aviv-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.6873,0.721515
2,0.4428,0.679479
3,0.3317,0.604971


TrainOutput(global_step=5403, training_loss=0.6221571788839911, metrics={'train_runtime': 879.6874, 'train_samples_per_second': 98.244, 'train_steps_per_second': 6.142, 'total_flos': 2862243739330560.0, 'train_loss': 0.6221571788839911, 'epoch': 3.0})

In [18]:
print("\nDistillation complete. Student model trained.")
distilled_num_params = sum(p.numel() for p in best_model.parameters())
print("Student model size:", distilled_num_params)


Distillation complete. Student model trained.
Student model size: 124649477


In [None]:
import os
import torch

model_name = PRETRAIN_MODEL.split("/")[1]
output_model_path = f"models/{model_name}-full"

os.makedirs(output_model_path, exist_ok=True)

### Save compress models

In [20]:
# Baseline
best_model.save_pretrained(f"{output_model_path}/baseline")
tokenizer.save_pretrained(f"{output_model_path}/baseline")

# Quantized
torch.save(
    quantized_model.state_dict(),
    f"{output_model_path}/quantized.pt"
)
tokenizer.save_pretrained(f"{output_model_path}/quantized")  # same tokenizer

# Pruned
model_to_prune.save_pretrained(f"{output_model_path}/pruned")
tokenizer.save_pretrained(f"{output_model_path}/pruned")

# Distilled
student.save_pretrained(f"{output_model_path}/distilled")
tokenizer.save_pretrained(f"{output_model_path}/distilled")

('models/2-twitter-roberta-base-sentiment-full/distilled/tokenizer_config.json',
 'models/2-twitter-roberta-base-sentiment-full/distilled/special_tokens_map.json',
 'models/2-twitter-roberta-base-sentiment-full/distilled/vocab.json',
 'models/2-twitter-roberta-base-sentiment-full/distilled/merges.txt',
 'models/2-twitter-roberta-base-sentiment-full/distilled/added_tokens.json',
 'models/2-twitter-roberta-base-sentiment-full/distilled/tokenizer.json')

### Load compress models (if needed)

In [None]:
# from transformers import AutoModelForSequenceClassification, AutoTokenizer

# # Load Baseline
# student = AutoModelForSequenceClassification.from_pretrained(f"{output_model_path}/baseline")
# distilled_tokenizer = AutoTokenizer.from_pretrained(f"{output_model_path}/baseline")

# # Load Quantized (CPU only!)
# quantized_tokenizer = AutoTokenizer.from_pretrained(f"{output_model_path}/quantized")
# quantized_model = AutoModelForSequenceClassification.from_pretrained(
#     PRETRAIN_MODEL, num_labels=5
# )
# quantized_model.load_state_dict(torch.load(f"{output_model_path}/quantized.pt"))
# quantized_model = quantize_dynamic(quantized_model, {torch.nn.Linear}, dtype=torch.qint8)

# # Load Pruned
# model_to_prune = AutoModelForSequenceClassification.from_pretrained(f"{output_model_path}/pruned")
# pruned_tokenizer = AutoTokenizer.from_pretrained(f"{output_model_path}/pruned")

# # Load Distilled
# student = AutoModelForSequenceClassification.from_pretrained(f"{output_model_path}/distilled")
# distilled_tokenizer = AutoTokenizer.from_pretrained(f"{output_model_path}/distilled")

### Compare the models

In [None]:
import wandb

model_name = PRETRAIN_MODEL.split("/")[1]
wandb.init(
    project=f"model_compression-{model_name}",
    name=f"Baseline-Quantized-Pruned-Distilled-{model_name}",
    mode="online"  # use "offline" if no internet
)

import evaluate
from sklearn.metrics import roc_auc_score
from scipy.special import softmax
import numpy as np
import pandas as pd
from transformers import Trainer

# Define compute_metrics
metric_acc = evaluate.load("accuracy")
metric_precision = evaluate.load("precision")
metric_recall = evaluate.load("recall")
metric_f1 = evaluate.load("f1")

def compute_ce_loss(outputs, labels, num_items_in_batch=None):
    labels = labels.long().view(-1)
    logits = outputs.logits
    return F.cross_entropy(logits, labels)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = metric_acc.compute(predictions=predictions, references=labels)
    precision = metric_precision.compute(predictions=predictions, references=labels, average="macro")
    recall = metric_recall.compute(predictions=predictions, references=labels, average="macro")
    f1 = metric_f1.compute(predictions=predictions, references=labels, average="macro")

    probs = softmax(logits, axis=1)
    try:
        auc = roc_auc_score(labels, probs, multi_class="ovr", average="macro")
    except ValueError:
        auc = float("nan")

    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"],
        "auc": auc,
    }

# Create lightweight Trainer wrapper for evaluation
def evaluate_model(model, name):
    use_cuda = torch.cuda.is_available() and name != "Quantized"
    model.to("cuda" if use_cuda else "cpu")
    args = TrainingArguments(
        output_dir="./tmp",
        per_device_eval_batch_size=16,
        no_cuda=not use_cuda,
        report_to="none",
    )
    
    trainer = Trainer(model=model, args=args, eval_dataset=eval_tok, compute_metrics=compute_metrics, compute_loss_func=compute_ce_loss)
    results = trainer.evaluate()
    
    dense_params = sum(p.numel() for p in model.parameters())
    nonzero_params = sum((p != 0).sum().item() for p in model.parameters())
    results.update({
        "params_dense": dense_params,
        "params_nonzero": nonzero_params,
        "model": name,
    })
    return results

# Evaluate all models
results = []
results.append(evaluate_model(model_to_prune, "Pruned"))
results.append(evaluate_model(student, "Distilled"))
results.append(evaluate_model(best_model, "Baseline"))
results.append(evaluate_model(quantized_model, "Quantized"))


# Put results in DataFrame
results_df = pd.DataFrame(results)
results_df = results_df[["model", "params_dense", "params_nonzero",
                         "eval_loss", "eval_accuracy", "eval_precision",
                         "eval_recall", "eval_f1", "eval_auc"]]

0,1
eval/loss,█▅▁
eval/runtime,▁▆█
eval/samples_per_second,█▃▁
eval/steps_per_second,█▃▁
train/epoch,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇█████
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▇▇▇▇████
train/grad_norm,▄▇▆▇▆▅▄█▃▄▄▃▃▅▁▄▂▃▁▂▄▁▅▂▁▃▂▃▅▄▄▃▄▂▂▂▂▂▂▃
train/learning_rate,███▇▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁
train/loss,█▅▄▅▄▃▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,0.60497
eval/runtime,53.2275
eval/samples_per_second,231.985
eval/steps_per_second,14.504
total_flos,2862243739330560.0
train/epoch,3.0
train/global_step,5403.0
train/grad_norm,11.6251
train/learning_rate,0.0
train/loss,0.3317




KeyError: "['params'] not in index"

In [23]:
results_df

Unnamed: 0,model,params_dense,params_nonzero,eval_loss,eval_accuracy,eval_precision,eval_recall,eval_f1,eval_auc
0,Pruned,124649477,99168312,1.292657,0.580661,0.714066,0.527425,0.513991,0.896321
1,Distilled,82122245,82121477,0.576013,0.799401,0.801039,0.821834,0.805277,0.964596
2,Baseline,124649477,124648709,0.788653,0.722708,0.724727,0.748257,0.731896,0.935407
3,Quantized,39037440,39036672,0.83008,0.709022,0.714616,0.724792,0.71871,0.927124


In [24]:
results_df.to_csv(f"{output_model_path}/comparison.csv")
results_df

Unnamed: 0,model,params_dense,params_nonzero,eval_loss,eval_accuracy,eval_precision,eval_recall,eval_f1,eval_auc
0,Pruned,124649477,99168312,1.292657,0.580661,0.714066,0.527425,0.513991,0.896321
1,Distilled,82122245,82121477,0.576013,0.799401,0.801039,0.821834,0.805277,0.964596
2,Baseline,124649477,124648709,0.788653,0.722708,0.724727,0.748257,0.731896,0.935407
3,Quantized,39037440,39036672,0.83008,0.709022,0.714616,0.724792,0.71871,0.927124
