In [1]:
# FILE: 01_baseline.py
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, f1_score
import joblib
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from accelerate import Accelerator
from tqdm import tqdm
import os
import time

print("="*80)
print("STEP 1: TRAINING BASELINE MODEL (TF-IDF + NAIVE BAYES)")
print("="*80)

# --- DATA LOADING ---
print("Loading IMDB dataset...")
dataset = load_dataset("imdb")
df_train = dataset['train'].to_pandas()
df_test = dataset['test'].to_pandas()
print(f"Loaded {len(df_train)} training samples and {len(df_test)} test samples.")

X_train, y_train = df_train['text'], df_train['label']
X_test, y_test = df_test['text'], df_test['label']

# --- MODEL TRAINING ---
baseline_model = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=20000, ngram_range=(1, 2))),
    ('classifier', MultinomialNB(alpha=0.1))
])

print("\nTraining the baseline model...")
baseline_model.fit(X_train, y_train)
print("Training complete.")

# --- EVALUATION ---
print("\nEvaluating baseline model on the test set...")
y_pred = baseline_model.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['NEGATIVE', 'POSITIVE']))

accuracy = baseline_model.score(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

# --- SAVING THE MODEL ---
output_dir = "./models"
os.makedirs(output_dir, exist_ok=True)
model_path = os.path.join(output_dir, "baseline_model.joblib")
joblib.dump(baseline_model, model_path)
print(f"\n✅ Baseline model saved to: {model_path}")
print("="*80)

2025-12-02 15:08:07.545284: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764688087.732526     109 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764688087.786635     109 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

STEP 1: TRAINING BASELINE MODEL (TF-IDF + NAIVE BAYES)
Loading IMDB dataset...


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Loaded 25000 training samples and 25000 test samples.

Training the baseline model...
Training complete.

Evaluating baseline model on the test set...

Classification Report:
              precision    recall  f1-score   support

    NEGATIVE       0.83      0.87      0.85     12500
    POSITIVE       0.86      0.83      0.84     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000

Test Accuracy: 0.8470

✅ Baseline model saved to: ./models/baseline_model.joblib


In [2]:
#!pip install --upgrade transformers huggingface_hub accelerate peft bitsandbytes accelerate

In [3]:
# FILE: 02_finetune_roberta.py
print("=" * 80)
print("STEP 2: FINE-TUNING ROBERTA-LARGE TEACHER MODEL")
print("=" * 80)

# --- CONFIGURATION ---
class Config:
    MODEL_NAME = "roberta-large"
    MAX_LENGTH = 256
    RANDOM_SEED = 42
    BATCH_SIZE = 16
    GRADIENT_ACCUMULATION = 2
    NUM_EPOCHS = 2
    LEARNING_RATE = 2e-5
    OUTPUT_DIR = "./models/teacher_roberta"
    LOGITS_OUTPUT_PATH = "./data/roberta_logits.csv"

config = Config()
os.makedirs("./data", exist_ok=True)

# --- DATA PREPARATION ---
print("Loading and preparing IMDB dataset...")
dataset = load_dataset("imdb")
train_data = dataset['train']
# Use a subset of the test set for validation during training
test_val_split = dataset['test'].train_test_split(test_size=0.5, seed=config.RANDOM_SEED)
val_data = test_val_split['train']
test_data = test_val_split['test']
print(f"Train: {len(train_data)}, Validation: {len(val_data)}, Test: {len(test_data)}")

# --- MODEL AND TOKENIZER ---
print(f"Loading model and tokenizer: {config.MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(config.MODEL_NAME, num_labels=2)

def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, max_length=config.MAX_LENGTH)

train_dataset = train_data.map(tokenize_function, batched=True)
val_dataset = val_data.map(tokenize_function, batched=True)
test_dataset = test_data.map(tokenize_function, batched=True)

# --- TRAINING ---
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

training_args = TrainingArguments(
    output_dir=config.OUTPUT_DIR,
    num_train_epochs=config.NUM_EPOCHS,
    per_device_train_batch_size=config.BATCH_SIZE,
    per_device_eval_batch_size=config.BATCH_SIZE,
    gradient_accumulation_steps=config.GRADIENT_ACCUMULATION,
    learning_rate=config.LEARNING_RATE,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    report_to="none",
    seed=config.RANDOM_SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer),
)

print("Starting RoBERTa fine-tuning...")
trainer.train()
print("Fine-tuning complete.")

# --- EVALUATION AND SAVING ---
if trainer.is_world_process_zero():
    print("\nEvaluating on test set...")
    test_results = trainer.evaluate(test_dataset)
    print(f"  Test Accuracy: {test_results['eval_accuracy']:.4f}")

    print("\nSaving model...")
    trainer.save_model(config.OUTPUT_DIR)
    tokenizer.save_pretrained(config.OUTPUT_DIR)
    print(f"  ✓ Model saved to: {config.OUTPUT_DIR}")

    print("\nGenerating and saving soft labels for distillation...")
    train_preds = trainer.predict(train_dataset)
    
    df_logits = pd.DataFrame({
        'label': train_preds.label_ids,
        'logit_0': train_preds.predictions[:, 0],
        'logit_1': train_preds.predictions[:, 1],
    })
    df_logits.to_csv(config.LOGITS_OUTPUT_PATH, index=False)
    print(f"  ✓ Soft labels saved to: {config.LOGITS_OUTPUT_PATH}")

print("\n✅ RoBERTa teacher preparation complete!")
print("="*80)

STEP 2: FINE-TUNING ROBERTA-LARGE TEACHER MODEL
Loading and preparing IMDB dataset...
Train: 25000, Validation: 12500, Test: 12500
Loading model and tokenizer: roberta-large


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Starting RoBERTa fine-tuning...




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.143336,0.94392
2,0.181200,0.142743,0.95208




Fine-tuning complete.

Evaluating on test set...




  Test Accuracy: 0.9536

Saving model...
  ✓ Model saved to: ./models/teacher_roberta

Generating and saving soft labels for distillation...




  ✓ Soft labels saved to: ./data/roberta_logits.csv

✅ RoBERTa teacher preparation complete!


In [4]:
# FILE: 3_finetuning_mistral.py
import torch
import gc
import os
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    BitsAndBytesConfig, 
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

# --- MEMORY CLEANUP START ---
gc.collect()
torch.cuda.empty_cache()
# ----------------------------

print("=" * 80)
print("STEP 3: FINE-TUNING MISTRAL (MEMORY OPTIMIZED)")
print("=" * 80)

class Config:
    MODEL_NAME = "mistralai/Mistral-7B-v0.1"
    TRAIN_SIZE = 1000   # Reduced slightly to ensure for Kaggle limits
    VAL_SIZE = 200
    RANDOM_SEED = 42
    LORA_R = 16
    LORA_ALPHA = 32
    
    # --- MEMORY OPTIMIZATIONS ---
    BATCH_SIZE = 1          # Process 1 sample at a time
    GRAD_ACC = 8           # Accumulate 16 steps to simulate Batch Size 16
    MAX_SEQ_LENGTH = 300    # Reduced to save VRAM
    LEARNING_RATE = 2e-4
    OUTPUT_DIR = "./models/teacher_mistral_adapters"

config = Config()

# --- 1. DATA ---
print("Loading and formatting data...")
dataset = load_dataset("imdb")
train_data = dataset['train'].shuffle(seed=config.RANDOM_SEED).select(range(config.TRAIN_SIZE))
val_data = dataset['test'].shuffle(seed=config.RANDOM_SEED).select(range(config.VAL_SIZE))

tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" 

def format_and_tokenize(example):
    # Truncate raw text to fit
    text_clean = example['text'][:1000] 
    label = "positive" if example['label'] == 1 else "negative"
    
    full_text = (
        f"[INST] Sentiment Analysis. Return 'positive' or 'negative'.\n"
        f"Review: {text_clean} [/INST] \n"
        f"Sentiment: {label}"
    )
    
    tokenized = tokenizer(
        full_text,
        truncation=True,
        max_length=config.MAX_SEQ_LENGTH,
        padding="max_length"
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

train_dataset = train_data.map(format_and_tokenize)
val_dataset = val_data.map(format_and_tokenize)

# --- 2. MODEL (4-bit + Gradient Checkpointing) ---
print(f"Loading base model: {config.MODEL_NAME}")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    config.MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto", # Let Accelerate handle placement
    use_cache=False
)

# ENABLE GRADIENT CHECKPOINTING
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# --- 3. LORA ---
peft_config = LoraConfig(
    r=config.LORA_R,
    lora_alpha=config.LORA_ALPHA,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type=TaskType.CAUSAL_LM,
    bias="none",
    lora_dropout=0.05,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# --- 4. TRAINING ---
training_args = TrainingArguments(
    output_dir=config.OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=config.BATCH_SIZE,
    gradient_accumulation_steps=config.GRAD_ACC,
    learning_rate=config.LEARNING_RATE,
    logging_steps=10,
    fp16=True,
    optim="paged_adamw_8bit", # Saves optimizer memory
    save_strategy="no",
    report_to="none",
    ddp_find_unused_parameters=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

print("Starting Memory-Optimized Fine-Tuning...")
trainer.train()

# --- 5. SAVING ---
print(f"Saving adapters to {config.OUTPUT_DIR}...")
trainer.model.save_pretrained(config.OUTPUT_DIR)
tokenizer.save_pretrained(config.OUTPUT_DIR)

# CLEANUP FOR NEXT STEPS
del model, trainer
gc.collect()
torch.cuda.empty_cache()
print("✅ Mistral fine-tuning complete & Memory cleared.")

STEP 3: FINE-TUNING MISTRAL (MEMORY OPTIMIZED)
Loading and formatting data...


tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Loading base model: mistralai/Mistral-7B-v0.1


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

trainable params: 13,631,488 || all params: 7,255,363,584 || trainable%: 0.1879
Starting Memory-Optimized Fine-Tuning...


  return fn(*args, **kwargs)


Step,Training Loss
10,2.3537
20,2.1628
30,2.1356
40,2.2195
50,2.1669
60,2.1375
70,2.1451
80,2.1053
90,2.2256
100,2.1664


Saving adapters to ./models/teacher_mistral_adapters...
✅ Mistral fine-tuning complete & Memory cleared.


In [5]:
# FILE: 04a_distill_from_roberta.py
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup
)
from datasets import load_dataset
from accelerate import Accelerator
from tqdm.auto import tqdm
import os

print("=" * 80)
print("STEP 4a: DISTILLATION (ROBERTA -> DISTILBERT)")
print("LOGITS + FEATURES + ATTENTION")
print("=" * 80)

# --- CONFIGURATION ---
class Config:
    TEACHER_PATH = "./models/teacher_roberta"
    STUDENT_NAME = "distilbert-base-uncased"
    OUTPUT_DIR = "./models/student_distilled_from_roberta"
    MAX_LENGTH = 256
    BATCH_SIZE = 32
    NUM_EPOCHS = 3
    LEARNING_RATE = 5e-5
    
    # Distillation Hyperparameters
    TEMP = 2.0           # Temperature for softening logits
    ALPHA_CE = 0.5       # Weight for Hard Labels (Ground Truth)
    ALPHA_KD = 0.5       # Weight for Soft Labels (Teacher Logits)
    ALPHA_FEAT = 0.3     # Weight for Hidden State matching
    
    # Layer Mapping: DistilBERT(6 layers) -> RoBERTa(24 layers)
    # Map every student layer to every 4th teacher layer
    # Format: {student_layer_index: teacher_layer_index}
    LAYER_MAPPING = {0: 3, 1: 7, 2: 11, 3: 15, 4: 19, 5: 23}

config = Config()

# --- 1. DATA PREPARATION ---
print("Loading and tokenizing data...")
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained(config.STUDENT_NAME)

def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, max_length=config.MAX_LENGTH)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_datasets.set_format("torch")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=config.BATCH_SIZE, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=config.BATCH_SIZE, collate_fn=data_collator
)

# --- 2. MODEL DEFINITION ---
print("Loading models...")

# Teacher: RoBERTa Large (Frozen)
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    config.TEACHER_PATH,
    num_labels=2,
    output_hidden_states=True,
    output_attentions=True
)
teacher_model.eval()
for param in teacher_model.parameters():
    param.requires_grad = False

# Student: DistilBERT (Trainable)
# We wrap it to handle the projection layer for feature matching
class DistillableStudent(nn.Module):
    def __init__(self, student_name, teacher_hidden_size):
        super().__init__()
        self.student = AutoModelForSequenceClassification.from_pretrained(
            student_name, 
            num_labels=2, 
            output_hidden_states=True, 
            output_attentions=True
        )
        self.student_hidden_size = self.student.config.hidden_size
        
        # Projection Layer: Maps Student (768) -> Teacher (1024)
        # Create one projection layer per mapped layer to allow specific adaptation
        self.projections = nn.ModuleList([
            nn.Linear(self.student_hidden_size, teacher_hidden_size)
            for _ in range(len(config.LAYER_MAPPING))
        ])

    def forward(self, input_ids, attention_mask, labels=None):
        return self.student(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

student_wrapper = DistillableStudent(config.STUDENT_NAME, teacher_model.config.hidden_size)

# --- 3. TRAINING SETUP (ACCELERATOR) ---
accelerator = Accelerator()
optimizer = torch.optim.AdamW(student_wrapper.parameters(), lr=config.LEARNING_RATE)

num_training_steps = config.NUM_EPOCHS * len(train_dataloader)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=num_training_steps
)

# Prepare everything with Accelerator
student_wrapper, teacher_model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    student_wrapper, teacher_model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)

# --- 4. LOSS FUNCTION ---
def compute_distillation_loss(student_outputs, teacher_outputs, batch, projections):
    # 1. Hard Label Loss (Cross Entropy with Ground Truth)
    loss_ce = student_outputs.loss
    
    # 2. Soft Label Loss (KL Divergence with Teacher Logits)
    loss_kd = F.kl_div(
        F.log_softmax(student_outputs.logits / config.TEMP, dim=-1),
        F.softmax(teacher_outputs.logits / config.TEMP, dim=-1),
        reduction="batchmean"
    ) * (config.TEMP ** 2)
    
    # 3. Feature Loss (Hidden State Matching)
    # Iterate through our mapping: Student Layer i -> Teacher Layer j
    loss_feat = 0
    # hidden_states[0] is embeddings, Start from index 1 for layers
    s_hiddens = student_outputs.hidden_states[1:] 
    t_hiddens = teacher_outputs.hidden_states[1:]
    
    for idx, (s_layer_idx, t_layer_idx) in enumerate(config.LAYER_MAPPING.items()):
        # Project Student (Batch, Seq, 768) -> (Batch, Seq, 1024)
        s_feat = projections[idx](s_hiddens[s_layer_idx])
        t_feat = t_hiddens[t_layer_idx]
        loss_feat += F.mse_loss(s_feat, t_feat)
    
    loss_feat = loss_feat / len(config.LAYER_MAPPING)

    # Total Loss
    return (config.ALPHA_CE * loss_ce) + (config.ALPHA_KD * loss_kd) + (config.ALPHA_FEAT * loss_feat)

# --- 5. TRAINING LOOP ---
print("Starting distillation...")

for epoch in range(config.NUM_EPOCHS):
    student_wrapper.train()
    total_loss = 0
    
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}", disable=not accelerator.is_local_main_process)
    
    for batch in progress_bar:
        # Forward Pass Teacher (No Grad)
        with torch.no_grad():
            teacher_outputs = teacher_model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"]
            )
        
        # Forward Pass Student
        student_outputs = student_wrapper(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"]
        )
        
        # Calculate Loss
        loss = compute_distillation_loss(
            student_outputs, 
            teacher_outputs, 
            batch, 
            student_wrapper.projections
        )
        
        # Backward
        optimizer.zero_grad()
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        
        total_loss += loss.item()
        progress_bar.set_postfix({"loss": loss.item()})
    
    avg_train_loss = total_loss / len(train_dataloader)
    if accelerator.is_main_process:
        print(f"Epoch {epoch+1} | Average Train Loss: {avg_train_loss:.4f}")

    # --- EVALUATION ---
    student_wrapper.eval()
    correct = 0
    total = 0
    
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = student_wrapper(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"]
            )
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct += (predictions == batch["labels"]).sum().item()
        total += batch["labels"].size(0)
    
    acc = correct / total
    if accelerator.is_main_process:
        print(f"Epoch {epoch+1} | Validation Accuracy: {acc:.4f}")

# --- 6. SAVING ---
if accelerator.is_main_process:
    print(f"\nSaving distilled model to {config.OUTPUT_DIR}...")
    os.makedirs(config.OUTPUT_DIR, exist_ok=True)
    
    # We unwrap the model to save just the DistilBERT part, not the projections
    # (Projections are only needed for training)
    unwrapped_wrapper = accelerator.unwrap_model(student_wrapper)
    unwrapped_wrapper.student.save_pretrained(config.OUTPUT_DIR)
    tokenizer.save_pretrained(config.OUTPUT_DIR)
    print("✅ Distillation complete.")

STEP 4a: DISTILLATION (ROBERTA -> DISTILBERT)
LOGITS + FEATURES + ATTENTION
Loading and tokenizing data...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Loading models...


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting distillation...


Epoch 1:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 1 | Average Train Loss: 0.5200
Epoch 1 | Validation Accuracy: 0.8392


Epoch 2:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 2 | Average Train Loss: 0.4146
Epoch 2 | Validation Accuracy: 0.8075


Epoch 3:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 3 | Average Train Loss: 0.3921
Epoch 3 | Validation Accuracy: 0.8748

Saving distilled model to ./models/student_distilled_from_roberta...
✅ Distillation complete.


In [6]:
# FILE: 04b_distill_from_mistral.py

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import PeftModel, PeftConfig
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
from tqdm import tqdm

print("=" * 80)
print("STEP 4b: DISTILLING MISTRAL (GENERATIVE) -> DISTILBERT (CLASSIFIER)")
print("=" * 80)

# --- CONFIG ---
MISTRAL_ADAPTER_PATH = "./models/teacher_mistral_adapters"
STUDENT_NAME = "distilbert-base-uncased"
OUTPUT_DIR = "./models/student_distilled_from_mistral"
CACHE_FILE = "mistral_soft_labels.csv"

# --- PART 1: GENERATE SOFT LABELS WITH MISTRAL ---
if not os.path.exists(CACHE_FILE):
    print("Generating soft labels from Mistral (this takes time)...")
    
    # Load Base + Adapter
    config = PeftConfig.from_pretrained(MISTRAL_ADAPTER_PATH)
    base_model = AutoModelForCausalLM.from_pretrained(
        config.base_model_name_or_path,
        load_in_4bit=True,
        device_map="auto"
    )
    teacher_model = PeftModel.from_pretrained(base_model, MISTRAL_ADAPTER_PATH)
    teacher_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
    teacher_tokenizer.pad_token = teacher_tokenizer.eos_token
    
    # Identify token IDs for "positive" and "negative"
    # Note: Tokenization depends on leading spaces. 
    pos_id = teacher_tokenizer.encode("positive", add_special_tokens=False)[0]
    neg_id = teacher_tokenizer.encode("negative", add_special_tokens=False)[0]
    print(f"Token IDs - Positive: {pos_id}, Negative: {neg_id}")

    dataset = load_dataset("imdb", split="train[:2000]") # Subset for demo speed
    
    results = []
    
    for item in tqdm(dataset):
        text = item['text'][:1000] # Truncate for speed
        prompt = f"[INST] Analyze the sentiment. Return 'positive' or 'negative'.\nReview: {text} [/INST] \nSentiment:"
        
        inputs = teacher_tokenizer(prompt, return_tensors="pt").to("cuda")
        
        with torch.no_grad():
            outputs = teacher_model(**inputs)
            # Get logits of the last token
            last_token_logits = outputs.logits[0, -1, :]
            
            # Extract logits for specific target words
            score_pos = last_token_logits[pos_id].item()
            score_neg = last_token_logits[neg_id].item()
            
        results.append({
            "text": item['text'],
            "gt_label": item['label'],
            "teacher_logit_pos": score_pos,
            "teacher_logit_neg": score_neg
        })
        
    df = pd.DataFrame(results)
    df.to_csv(CACHE_FILE, index=False)
    # Clean up GPU
    del teacher_model, base_model
    torch.cuda.empty_cache()
else:
    print(f"Loading cached soft labels from {CACHE_FILE}")
    df = pd.read_csv(CACHE_FILE)

# --- PART 2: TRAIN STUDENT (DISTILBERT) ---
print("Training Student on Mistral's Soft Labels...")

# Convert logits to probabilities (Softmax)
# [neg_logit, pos_logit] as the two class logits
teacher_logits = df[['teacher_logit_neg', 'teacher_logit_pos']].values
probs = torch.nn.functional.softmax(torch.tensor(teacher_logits, dtype=torch.float32), dim=1)

# Create dataset
train_ds = Dataset.from_dict({
    "text": df['text'].tolist(),
    "label": df['gt_label'].tolist(),
    "teacher_probs": probs.tolist()
})

tokenizer = AutoTokenizer.from_pretrained(STUDENT_NAME)
def tokenize(batch):
    return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=256)

train_ds = train_ds.map(tokenize, batched=True)

# Custom Trainer to handle Soft Labels
class DistillationTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs): # Fixed signature
        labels = inputs.get("labels")
        teacher_probs = inputs.pop("teacher_probs") # Extract custom column
        
        outputs = model(**inputs)
        student_logits = outputs.logits
        
        # KL Divergence Loss
        # Student LogSoftmax vs Teacher Probs
        loss_kd = F.kl_div(
            F.log_softmax(student_logits, dim=-1),
            teacher_probs,
            reduction="batchmean"
        )
        
        # Standard Cross Entropy with Ground Truth
        loss_ce = F.cross_entropy(student_logits, labels)
        
        # Weighted Sum
        loss = 0.5 * loss_kd + 0.5 * loss_ce
        
        return (loss, outputs) if return_outputs else loss

student_model = AutoModelForSequenceClassification.from_pretrained(STUDENT_NAME, num_labels=2)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    remove_unused_columns=False, # For'teacher_probs' not be dropped
    report_to="none"
)

trainer = DistillationTrainer(
    model=student_model,
    args=training_args,
    train_dataset=train_ds,
)

trainer.train()
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("✅ Mistral -> DistilBERT Distillation Complete.")

STEP 4b: DISTILLING MISTRAL (GENERATIVE) -> DISTILBERT (CLASSIFIER)
Generating soft labels from Mistral (this takes time)...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Token IDs - Positive: 5278, Negative: 7087


100%|██████████| 2000/2000 [1:03:29<00:00,  1.90s/it]


Training Student on Mistral's Soft Labels...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


✅ Mistral -> DistilBERT Distillation Complete.


In [7]:
# FILE: 05_comparison.py
import pandas as pd
import numpy as np
import torch
import time
import os
import gc
import joblib
from tqdm import tqdm
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    AutoModelForCausalLM, 
    BitsAndBytesConfig
)
from peft import PeftModel, PeftConfig

print("=" * 80)
print("STEP 5: COMPREHENSIVE BENCHMARK (TEACHERS VS STUDENTS)")
print("=" * 80)

# --- CONFIGURATION ---
class BenchConfig:
    # Paths
    PATH_BASELINE = "./models/baseline_model.joblib"
    PATH_TEACHER_ROBERTA = "./models/teacher_roberta"
    PATH_TEACHER_MISTRAL = "./models/teacher_mistral_adapters"
    PATH_STUDENT_ROBERTA = "./models/student_distilled_from_roberta"
    PATH_STUDENT_MISTRAL = "./models/student_distilled_from_mistral"
    
    # Settings
    TEST_SUBSET_SIZE = 500  # Keep small for Kaggle execution time
    LATENCY_SAMPLES = 50    # Samples to measure single-item latency
    BATCH_SIZE = 16         # For throughput measurement
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

config = BenchConfig()

# --- UTILS ---
def get_model_size_mb(path):
    """Calculates directory size in MB"""
    if os.path.isfile(path):
        return os.path.getsize(path) / (1024 * 1024)
    total_size = 0
    for dirpath, _, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size / (1024 * 1024)

def clean_memory():
    """Aggressively clears GPU memory"""
    gc.collect()
    torch.cuda.empty_cache()

# --- DATA LOADING ---
print("Loading Test Data...")
dataset = load_dataset("imdb", split="test")
# Shuffle and select subset to save time
dataset = dataset.shuffle(seed=42).select(range(config.TEST_SUBSET_SIZE))
texts = dataset['text']
labels = dataset['label']

results = []

# ==========================================
# 1. BENCHMARK BASELINE
# ==========================================
try:
    print("\n--- Benchmarking Baseline (TF-IDF) ---")
    model = joblib.load(config.PATH_BASELINE)
    
    # Latency (CPU)
    start = time.time()
    for i in range(config.LATENCY_SAMPLES):
        _ = model.predict([texts[i]])
    avg_latency = ((time.time() - start) / config.LATENCY_SAMPLES) * 1000
    
    # Throughput
    start = time.time()
    preds = model.predict(texts)
    total_time = time.time() - start
    throughput = len(texts) / total_time
    
    # Accuracy
    acc = accuracy_score(labels, preds)
    
    results.append({
        "Model": "Baseline (TF-IDF)",
        "Type": "Traditional",
        "Params (M)": 0, # Negligible
        "Size (MB)": get_model_size_mb(config.PATH_BASELINE),
        "Accuracy": acc,
        "Latency (ms)": avg_latency,
        "Throughput (samp/s)": throughput
    })
    del model
except Exception as e:
    print(f"Skipped Baseline: {e}")

# ==========================================
# 2. BENCHMARK TRANSFORMER CLASSIFIERS
# (RoBERTa Teacher, DistilBERT Students)
# ==========================================
def benchmark_classifier(model_path, model_name, model_type):
    print(f"\n--- Benchmarking {model_name} ---")
    clean_memory()
    
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        model.to(config.DEVICE).eval()
        
        # Calculate Params
        num_params = sum(p.numel() for p in model.parameters()) / 1_000_000
        
        # 1. Latency (Batch size 1)
        latencies = []
        for i in range(config.LATENCY_SAMPLES):
            inputs = tokenizer(texts[i], return_tensors="pt", truncation=True, max_length=256).to(config.DEVICE)
            start = time.time()
            with torch.no_grad():
                _ = model(**inputs)
            latencies.append((time.time() - start) * 1000)
        avg_latency = np.mean(latencies)
        
        # 2. Accuracy & Throughput (Batched)
        all_preds = []
        start_time = time.time()
        
        for i in range(0, len(texts), config.BATCH_SIZE):
            batch_texts = texts[i : i + config.BATCH_SIZE]
            inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=256).to(config.DEVICE)
            
            with torch.no_grad():
                outputs = model(**inputs)
                preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
                all_preds.extend(preds)
                
        total_time = time.time() - start_time
        throughput = len(texts) / total_time
        acc = accuracy_score(labels, all_preds)
        
        results.append({
            "Model": model_name,
            "Type": model_type,
            "Params (M)": num_params,
            "Size (MB)": get_model_size_mb(model_path),
            "Accuracy": acc,
            "Latency (ms)": avg_latency,
            "Throughput (samp/s)": throughput
        })
        
        del model, tokenizer
        clean_memory()
        
    except Exception as e:
        print(f"Failed to benchmark {model_name}: {e}")

# Run for Classifiers
benchmark_classifier(config.PATH_TEACHER_ROBERTA, "Teacher (RoBERTa-L)", "Teacher")
benchmark_classifier(config.PATH_STUDENT_ROBERTA, "Student (from RoBERTa)", "Student")
benchmark_classifier(config.PATH_STUDENT_MISTRAL, "Student (from Mistral)", "Student")

# ==========================================
# 3. BENCHMARK MISTRAL (GENERATIVE)
# ==========================================
print(f"\n--- Benchmarking Teacher (Mistral-7B) ---")
clean_memory()

try:
    # Load Adapter Config
    peft_config = PeftConfig.from_pretrained(config.PATH_TEACHER_MISTRAL)
    
    # Load Base Model (4-bit)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )
    
    base_model = AutoModelForCausalLM.from_pretrained(
        peft_config.base_model_name_or_path,
        quantization_config=bnb_config,
        device_map="auto"
    )
    
    # Load Adapter
    model = PeftModel.from_pretrained(base_model, config.PATH_TEACHER_MISTRAL)
    tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)
    tokenizer.pad_token = tokenizer.eos_token
    
    # Params (Base + Adapter)
    num_params = sum(p.numel() for p in model.parameters()) / 1_000_000 # This counts all params (even frozen)
    
    # Helper for Prompting
    def format_prompt(text):
        return f"[INST] Analyze the sentiment. Return 'positive' or 'negative'.\nReview: {text[:1000]} [/INST] \nSentiment:"

    # 1. Latency (Generation is slow, do fewer samples)
    latencies = []
    for i in range(10): # Only 10 samples for Mistral Latency to save time
        prompt = format_prompt(texts[i])
        inputs = tokenizer(prompt, return_tensors="pt").to(config.DEVICE)
        
        start = time.time()
        with torch.no_grad():
            _ = model.generate(**inputs, max_new_tokens=2, pad_token_id=tokenizer.eos_token_id)
        latencies.append((time.time() - start) * 1000)
    avg_latency = np.mean(latencies)

    # 2. Accuracy & Throughput
    # Batch size 1 for simplicity and correctness in this script, 
    # acknowledging this penalizes Mistral's throughput slightly.
    
    correct = 0
    start_time = time.time()
    
    # Limit Mistral evaluation to 100 samples
    eval_limit = 100 
    print(f"  (Evaluating Mistral on first {eval_limit} samples only...)")
    
    for i in tqdm(range(eval_limit)):
        prompt = format_prompt(texts[i])
        inputs = tokenizer(prompt, return_tensors="pt").to(config.DEVICE)
        
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=5, pad_token_id=tokenizer.eos_token_id)
        
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True).lower()
        
        # Simple parsing
        prediction = 1 if "positive" in generated_text.split("sentiment:")[-1] else 0
        if prediction == labels[i]:
            correct += 1
            
    total_time = time.time() - start_time
    throughput = eval_limit / total_time
    acc = correct / eval_limit
    
    results.append({
        "Model": "Teacher (Mistral-7B)",
        "Type": "Teacher",
        "Params (M)": 7000, # Approx
        "Size (MB)": 15000, # Approx 4-bit size
        "Accuracy": acc,
        "Latency (ms)": avg_latency,
        "Throughput (samp/s)": throughput
    })
    
    del model, base_model, tokenizer
    clean_memory()

except Exception as e:
    print(f"Failed to benchmark Mistral: {e}")

# ==========================================
# 4. FINAL REPORT
# ==========================================
print("\n" + "="*80)
print("FINAL BENCHMARK REPORT")
print("="*80)

df = pd.DataFrame(results)
df = df.set_index("Model")

# Calculate Improvement Metrics
try:
    rob_lat = df.loc["Teacher (RoBERTa-L)", "Latency (ms)"]
    rob_size = df.loc["Teacher (RoBERTa-L)", "Size (MB)"]
    
    df["Speedup (x)"] = rob_lat / df["Latency (ms)"]
    df["Size Reduction (x)"] = rob_size / df["Size (MB)"]
except KeyError:
    pass

# Reorder columns
cols = ["Type", "Accuracy", "Latency (ms)", "Throughput (samp/s)", "Speedup (x)", "Size (MB)", "Params (M)"]
df = df[cols]

print(df.to_string(float_format="%.2f"))

print("\n--- Analysis ---")
print("1. Latency: Lower is better. Critical for real-time APIs.")
print("2. Throughput: Higher is better. Critical for offline batch processing.")
print("3. Mistral Note: Generative models are significantly slower than classifiers because")
print("   they generate token-by-token. Distilling Mistral -> DistilBERT unlocks massive speedups.")
print("="*80)

STEP 5: COMPREHENSIVE BENCHMARK (TEACHERS VS STUDENTS)
Loading Test Data...

--- Benchmarking Baseline (TF-IDF) ---

--- Benchmarking Teacher (RoBERTa-L) ---

--- Benchmarking Student (from RoBERTa) ---





--- Benchmarking Student (from Mistral) ---

--- Benchmarking Teacher (Mistral-7B) ---


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  (Evaluating Mistral on first 100 samples only...)


100%|██████████| 100/100 [01:26<00:00,  1.15it/s]



FINAL BENCHMARK REPORT
                               Type  Accuracy  Latency (ms)  Throughput (samp/s)  Speedup (x)  Size (MB)  Params (M)
Model                                                                                                               
Baseline (TF-IDF)       Traditional      0.84          1.33              4135.32        12.84      28.80        0.00
Teacher (RoBERTa-L)         Teacher      0.96         17.11                18.41         1.00    5432.01      355.36
Student (from RoBERTa)      Student      0.87          5.91               103.00         2.89     256.33       66.96
Student (from Mistral)      Student      0.51          4.07               115.05         4.21    1022.69       66.96
Teacher (Mistral-7B)        Teacher      0.95        522.36                 1.15         0.03   15000.00     7000.00

--- Analysis ---
1. Latency: Lower is better. Critical for real-time APIs.
2. Throughput: Higher is better. Critical for offline batch processing.
3. Mistra