In [3]:
!pip install datasets accelerate hf-transfer



In [4]:
!pip install evaluate



In [5]:
from datasets import load_dataset
# Load mnli dataset
dataset = load_dataset("glue", "mnli")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load tokenizer and models
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-large")

teacher_cls = AutoModelForSequenceClassification.from_pretrained(
    "answerdotai/ModernBERT-large",
    num_labels=3,
    problem_type="single_label_classification"
)
student_cls = AutoModelForSequenceClassification.from_pretrained(
    "kartikeya-pandey/MiniModernBERT-Pretrained",
    num_labels=3,
    problem_type="single_label_classification"
)

teacher_cls = teacher_cls.to("cuda")
student_cls = student_cls.to("cuda")

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at kartikeya-pandey/MiniModernBERT-Pretrained and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Preprocess function for MNLI
def preprocess(example):
    return tokenizer(
        example["premise"],
        example["hypothesis"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

In [8]:
tokenized_dataset = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

In [9]:
import torch
import torch.nn.functional as F
from torch.nn import KLDivLoss, CrossEntropyLoss

# Distillation loss (same as QQP)
def compute_distill_loss(student_logits, teacher_logits, labels, alpha=0.6, temperature=1.0):
    kl_loss = KLDivLoss(reduction="batchmean")(
        F.log_softmax(student_logits / temperature, dim=-1),
        F.softmax(teacher_logits / temperature, dim=-1)
    ) * (temperature ** 2)

    ce_loss = CrossEntropyLoss()(student_logits, labels)
    return alpha * kl_loss + (1 - alpha) * ce_loss

In [10]:
# Custom compute_loss for Trainer
def compute_loss(model, inputs, return_outputs=False):
    labels = inputs["labels"].long()
    with torch.no_grad():
        teacher_outputs = teacher_cls(**inputs)

    student_outputs = model(**inputs)
    loss = compute_distill_loss(
        student_outputs.logits,
        teacher_outputs.logits,
        labels
    )

    return (loss, student_outputs) if return_outputs else loss

In [11]:
student_cls.compute_loss = compute_loss.__get__(student_cls)

In [12]:
from huggingface_hub import notebook_login

# This will prompt you to enter your token securely in the notebook
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
import evaluate
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [14]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    output_dir="./MiniModernBERT-glue-mnli",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    learning_rate=3e-5,
    fp16=True,
    logging_steps=500,
    report_to="none",
    push_to_hub=True,
    hub_model_id="kartikeyapandey20/MiniModernBERT-mnli",
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "f1": f1.compute(predictions=predictions, references=labels)["f1"]
    }

trainer = Trainer(
    model=student_cls,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation_matched"],  # use "validation_mismatched" optionally for domain generalization
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# -------------------
# Train & Evaluate
# -------------------
trainer.train()

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None, 'bos_token_id': None}.
W1024 12:02:56.410000 7948 torch/_inductor/utils.py:1436] [1/0_1] Not enough SMs to use max_autotune_gemm mode


Step,Training Loss
500,1.1105
1000,0.9508
1500,0.8743
2000,0.8288
2500,0.8415
3000,0.7971
3500,0.7861
4000,0.7716
4500,0.7612
5000,0.7686


SafetensorError: Error while serializing: I/O error: No space left on device (os error 28)

In [None]:
# Remove the custom loss before normal evaluation
del student_cls.compute_loss

In [None]:
trainer.evaluate()