In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Knowledge Distillation
**Distillation learning** is a technique where a smaller, simpler model (called the student) is trained to mimic the behavior of a larger, more complex model (called the teacher). 
The goal is to **transfer the knowledge from the teacher model to the student model**, 
enabling the **student** to achieve **similar performance** while being **more efficient in terms of size, speed, and resource usage**.

## Kullback-Leibler divergence (KL divergence) 
* KL divergence measures how far the probability distribution predicted by the student model is from the one predicted by the teacher model. 
* Both the teacher's and the student's logits are divided by a temperature $T$ before the softmax is applied, and the resulting KL term is multiplied by $T^2$. 
* A higher temperature smooths the probability distributions, making them easier for the student to learn from.

## Knowledge Distillation Step-by-Step
1. **Train the Teacher**
Train a large, complex teacher model (like BERT) on your dataset using its standard loss function (e.g., cross-entropy loss for classification tasks).
2. **Generate Teacher Predictions**
Use the teacher model to generate predictions for your training data. These predictions will be used as a target for the student model.
3. **Set Up the Student Model** 
Initialize a smaller student model (like DistilBERT). 
Define the loss function combining cross-entropy loss, knowledge distillation loss, and optionally, cosine similarity loss.
4. **Compute Losses** 
Combine these losses into a single loss function for training the student model:
$$L_{\text{student}} = \alpha L_{\text{CE}} + (1 - \alpha) L_{\text{KD}}$$
5. **Train Student Model** 
Use the combined loss function to train the student model on your dataset.


**Note:** More details on knowledge distillation can be found in the notebook linked below, which also inspired this one:
https://github.com/nlp-with-transformers/notebooks/blob/main/08_model-compression.ipynb

* Here, model compression is applied to a RoBERTa model.

In [None]:
import os
import shutil

dir_path = "/kaggle/working/"
# Empty the working directory instead of removing the directory itself:
# it is the notebook's current directory, and later cells write into it
# through both absolute and relative paths.
try:
    with os.scandir(dir_path) as entries:
        # Snapshot the listing first so deletions do not disturb the iteration.
        stale_entries = list(entries)
    for entry in stale_entries:
        # Symlinks are unlinked rather than followed, so nothing outside
        # dir_path is touched.
        if entry.is_dir(follow_symlinks=False):
            shutil.rmtree(entry.path)
        else:
            os.remove(entry.path)
    print(f"{dir_path} has been emptied.")
except OSError as e:
    # Best-effort cleanup: report the problem and let the notebook continue.
    print(f"Failed to delete {dir_path}. Reason: {e}")

In [None]:
import torch

# Use the GPU when PyTorch reports one as available; otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

# Load and Explore The Data

In [None]:
from datasets import load_dataset

# Load the "plus" configuration of the clinc_oos dataset and display its splits.
clinc_ds = load_dataset(path="clinc_oos", name="plus")
clinc_ds

In [None]:
# `intents` is the 'intent' feature of the training split. Displaying it shows
# the label definition; later cells use it to convert label ids to strings and
# to read the number of classes.

intents = clinc_ds['train'].features['intent']
intents

In [None]:
import random
# RANDOM_SEED = 42
# random.seed(RANDOM_SEED)

# randrange excludes the upper bound. randint(0, len(...)) includes it, so it
# could return len(...) itself and make the lookup below raise IndexError.
rand_idx = random.randrange(len(clinc_ds['train']))
sample_example = clinc_ds['train'][rand_idx]

print(f'sample example: {sample_example}')
print(f'intent converted to str: {intents.int2str(sample_example["intent"])}')

In [None]:
# randrange excludes the upper bound, so the index is always valid for the
# test split (randint's inclusive upper bound could raise IndexError).
rand_idx = random.randrange(len(clinc_ds['test']))
sample_example_test = clinc_ds['test'][rand_idx]

print(f'sample example: {sample_example_test}')
print(f'intent converted to str: {intents.int2str(sample_example_test["intent"])}')

# Transformer Classification Pipeline

## Make predictions with Transformer pipeline

In [None]:
import transformers
from transformers import pipeline

# Baseline (teacher) checkpoint used throughout the notebook.
# Alternative baseline: 'transformersbook/bert-base-uncased-finetuned-clinc'
baseline_model_name = 'roberta-large-finetuned-clinc'
baseline_model_ckpt = f'optimum/{baseline_model_name}'

# Text-classification pipeline around the baseline, tried on the sampled training example.
pipe = pipeline(task='text-classification', model=baseline_model_ckpt, device=device)
pipe_out = pipe(sample_example['text'])
pipe_out

In [None]:
# Same pipeline, this time applied to the example sampled from the test split.
pipe_out = pipe(sample_example_test['text'])
pipe_out

In [None]:
# check current working directory 

import os

# Record and report the directory the notebook is currently running in.
current_directory = os.getcwd()
print(f"Current Working Directory: {current_directory}")

# Install evaluate

* **evaluate** offers a wide range of **pre-built evaluation metrics commonly used in NLP**, including accuracy, F1 score, BLEU, ROUGE, and more.

In [None]:
# try:
#     import evaluate
#     print("evaluate is already installed.")
# except ImportError:
#     !pip install evaluate
#     import evaluate
#     print("evaluate has been installed and imported.")

# Performance Benchmark Class (To evaluate transformer models)

**Accuracy**: Measures how often the model’s predictions match the true labels

**Model Size**: Refers to disk space the model occupies

**Latency**: Represents the duration required for the model to process inputs or complete tasks

In [None]:
# Batch-size constant.
# NOTE(review): no later cell in view reads BATCH_SIZE; the training cell
# defines its own lowercase `batch_size`. Consider unifying the two.
BATCH_SIZE = 48

In [None]:
import torch
import transformers
from transformers import pipeline
import datasets
from datasets import load_metric
# import evaluate

# Shared accuracy metric object: `PerformanceBenchmark.compute_accuracy` and
# `compute_metrics` both call `accuracy_score.compute(...)` on it.
accuracy_score = load_metric('accuracy', trust_remote_code=True)

from tqdm import tqdm

import numpy as np

from pathlib import Path
import time


class PerformanceBenchmark:
    """
    Benchmarks a text-classification pipeline on three axes: serialized model
    size, single-query latency and accuracy on a labelled dataset.

    Attributes:
        pipeline: callable pipeline; ``pipeline.model.state_dict()`` is used for
            the size measurement and ``pipeline(text_or_texts)`` for inference.
        dataset: iterable of samples with ``'text'`` and ``'intent'`` keys whose
            ``features['intent']`` converts predicted label strings to ids.
        model_name: label used as the key of the returned metrics and as the
            path of the temporary file written by ``compute_size``.
    """

    def __init__(self,
                 pipeline: "transformers.Pipeline",
                 dataset: "datasets.Dataset",
                 model_name: str = 'model.pt') -> None:
        # Annotations are quoted so they document the expected types without
        # being evaluated when the class is defined.
        self.model_name = model_name
        self.pipeline = pipeline
        self.dataset = dataset

    def compute_size(self) -> dict:
        """
        Computes size of pipeline model.

        The model's ``state_dict`` is saved to ``Path(self.model_name)``, the
        file size is read, and the file is removed again.

        Returns:
            ``{'model_size_MBs': size}`` with the size in MiB rounded to 2 decimals.
        """
        model_state_dict = self.pipeline.model.state_dict()  # all parameters/buffers
        tmp_path = Path(self.model_name)
        try:
            torch.save(model_state_dict, tmp_path)  # temporarily save the model
            model_size = np.round(tmp_path.stat().st_size / (1024 * 1024), 2)
        finally:
            # Remove the temporary file even if saving or stat-ing failed.
            tmp_path.unlink(missing_ok=True)

        print(f'Size of Model {self.model_name}: {model_size} MB')

        return {'model_size_MBs': model_size}

    def compute_accuracy(self) -> dict:
        """
        Computes accuracy score.

        Runs the pipeline on every ``'text'`` of the dataset, maps the predicted
        label strings to ids with the dataset's own ``'intent'`` feature and
        scores them against the ``'intent'`` ids using the module-level
        ``accuracy_score`` metric.

        Returns:
            Whatever ``accuracy_score.compute`` returns for the predictions.
        """
        # Collect all texts in a list for batch processing
        texts = [sample['text'] for sample in tqdm(self.dataset, desc="Processing texts")]
        predictions = self.pipeline(texts)

        # Use the label definition carried by the dataset itself rather than a
        # notebook-level global, so the benchmark does not rely on hidden state.
        label_feature = self.dataset.features['intent']

        preds, labels = [], []
        for prediction, sample in tqdm(zip(predictions, self.dataset),
                                       total=len(texts),
                                       desc="getting preds and labels"):
            preds.append(label_feature.str2int(prediction['label']))
            labels.append(sample['intent'])

        accuracy = accuracy_score.compute(predictions=preds,
                                          references=labels)
        print(f'accuracy score: {accuracy}')

        return accuracy

    def compute_latency(self,
                        query: str = 'How can I find my account PIN?',
                        warmup_runs: int = 10,
                        timed_runs: int = 100) -> dict:
        """
        Computes execution time for input query.

        Args:
            query: text sent to the pipeline on every call.
            warmup_runs: number of untimed calls made before measuring.
            timed_runs: number of timed calls the statistics are computed over.

        Returns:
            ``{'avg_latency_msec': mean, 'std_latency_msec': std}`` in milliseconds.
        """
        # warm up phase
        for _ in range(warmup_runs):
            self.pipeline(query)

        # compute latency time
        latencies = []
        for _ in range(timed_runs):
            start_time = time.perf_counter()
            self.pipeline(query)
            latencies.append(time.perf_counter() - start_time)
        avg_latency = 1000 * np.mean(latencies)
        std_latency = 1000 * np.std(latencies)

        # "+/-" replaces the former "+\-", which is an invalid escape sequence.
        print(f'avg latency: {avg_latency} +/- {std_latency} msec')

        return {'avg_latency_msec': avg_latency,
                'std_latency_msec': std_latency}

    def run_benchmark(self) -> dict:
        """
        Run benchmark to compute size, accuracy and latency of pipeline.

        Returns:
            ``{self.model_name: {...}}`` where the inner dict merges the results
            of ``compute_size``, ``compute_latency`` and ``compute_accuracy``.
        """
        metrics = {}

        metrics[self.model_name] = self.compute_size()
        metrics[self.model_name].update(self.compute_latency())
        metrics[self.model_name].update(self.compute_accuracy())

        print(f'{self.model_name} metrics: {metrics}')

        return metrics

## Benchmark Baseline Transformer pipeline

In [None]:
# Benchmark the baseline pipeline on the test split. `performance_metrics` is
# the dict that the later benchmark cells extend via `.update(...)`.
pb = PerformanceBenchmark(pipe, clinc_ds['test'], model_name=baseline_model_name)
performance_metrics = pb.run_benchmark()
performance_metrics

# Distillation Learning

## DistillationTrainingArguments

In [None]:
from transformers import TrainingArguments

class DistillationTrainingArguments(TrainingArguments):
    """
    ``TrainingArguments`` carrying two extra attributes for distillation.

    Args:
        alpha: stored as ``self.alpha`` (default 0.5).
        temperature: stored as ``self.temperature`` (default 2.0).

    All other positional and keyword arguments are passed to
    ``TrainingArguments`` unchanged.
    """

    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        # The base class receives everything except the two distillation knobs.
        super().__init__(*args, **kwargs)
        self.temperature, self.alpha = temperature, alpha

## DistillationTrainer

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from transformers import Trainer

class DistillationTrainer(Trainer):
    """
    ``Trainer`` whose loss blends the student's own loss with a
    temperature-scaled KL-divergence term computed against a teacher model:

        final_loss = alpha * loss_student + (1 - alpha) * T**2 * KL

    ``alpha`` and ``T`` are read from ``self.args.alpha`` and
    ``self.args.temperature`` on every call.

    Args:
        teacher_model: model called with the same ``inputs`` as the student; its
            ``.logits`` are the distillation target. It may be ``None`` only
            when ``alpha == 1`` (the teacher term then has zero weight).
    """

    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        if self.teacher_model is not None:
            # The teacher is a fixed target: keep it in inference mode.
            self.teacher_model.eval()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute the combined distillation loss for one batch.

        Args:
            model: the student model being trained.
            inputs: keyword arguments passed to both student and teacher.
            return_outputs: when true, return ``(loss, student_outputs)``.
            num_items_in_batch: accepted for compatibility with Trainer
                versions that pass it; not used here.

        Returns:
            The loss tensor, or ``(loss, student_outputs)`` if ``return_outputs``.

        Raises:
            ValueError: if ``alpha != 1`` and no teacher model was provided.
        """
        outputs_student = model(**inputs)
        loss_student = outputs_student.loss

        alpha = self.args.alpha
        if alpha == 1:
            # The teacher term has zero weight, so skip the teacher forward pass.
            final_loss = loss_student
        else:
            if self.teacher_model is None:
                raise ValueError("teacher_model is required when alpha != 1")

            temperature = self.args.temperature
            logits_student = outputs_student.logits

            # No gradients are needed for the teacher.
            with torch.no_grad():
                logits_teacher = self.teacher_model(**inputs).logits

            # KLDivLoss expects log-probabilities as input and probabilities as target.
            loss_fcn = nn.KLDivLoss(reduction='batchmean')
            loss_kld = ((temperature ** 2)
                        * loss_fcn(F.log_softmax(logits_student / temperature, dim=-1),
                                   F.softmax(logits_teacher / temperature, dim=-1)))

            final_loss = alpha * loss_student + (1. - alpha) * loss_kld

        return (final_loss, outputs_student) if return_outputs else final_loss

## Student Model

## Student Configuration

In [None]:
from transformers import AutoConfig

# Reuse the baseline pipeline's label mappings for the student configuration.
id2label, label2id = pipe.model.config.id2label, pipe.model.config.label2id

student_model_name = 'distilroberta-base'
student_model_ckpt = f'distilbert/{student_model_name}'
student_config = AutoConfig.from_pretrained(
    student_model_ckpt,
    num_labels=intents.num_classes,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Re-evaluate the device choice: GPU when PyTorch reports one, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

### student_init()
`student_init()` returns a new student model instance every time it is called, so each training run starts from the pretrained checkpoint rather than from a previously trained student.

In [None]:
from transformers import AutoModelForSequenceClassification

def student_init():
    """
    Build a new student model from ``student_model_ckpt`` with
    ``student_config`` and return it moved to ``device``.
    """
    fresh_student = AutoModelForSequenceClassification.from_pretrained(
        student_model_ckpt, config=student_config
    )
    return fresh_student.to(device)

## Teacher Model

In [None]:
from transformers import AutoModelForSequenceClassification

# The teacher is the baseline checkpoint, loaded as a sequence classifier
# with one output per intent class and moved to the selected device.
teacher_model_ckpt = baseline_model_ckpt
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_model_ckpt,
    num_labels=intents.num_classes,
)
teacher_model = teacher_model.to(device)

## Tokenize Dataset

In [None]:
from transformers import AutoTokenizer

# Tokenizer loaded from the student checkpoint; `tokenize_text` uses it to
# encode the dataset and it is later saved next to the trained models.
student_tokenizer = AutoTokenizer.from_pretrained(student_model_ckpt)

In [None]:
def tokenize_text(batch):
    """
    Tokenize ``batch['text']`` with ``student_tokenizer``, truncation enabled,
    and return the tokenizer's output.
    """
    texts = batch['text']
    return student_tokenizer(texts, truncation=True)

In [None]:
# Tokenize every split in batches, drop the raw text column and rename the
# target column from 'intent' to 'labels'.
clinc_encoded = (
    clinc_ds
    .map(tokenize_text, batched=True, remove_columns='text')
    .rename_column('intent', 'labels')
)
clinc_encoded

In [None]:
# Inspect the first encoded training example.
print(clinc_encoded['train'][0])

# Knowledge Distillation Training

## Compute Metrics Function

In [None]:
# import evaluate
import numpy as np

def compute_metrics(preds):
    """
    Score a ``(predictions, labels)`` pair with the shared accuracy metric.

    The predicted class is the argmax over axis 1 of the predictions. If the
    number of predicted classes and labels differ, a warning is printed and
    both are cut to the shorter length before scoring.

    Returns:
        The result of ``accuracy_score.compute`` for the predictions.
    """
    scores, labels = preds
    predictions = np.argmax(scores, axis=1)

    n_pred, n_ref = len(predictions), len(labels)
    if n_pred != n_ref:
        print(f"Warning: Mismatch in predictions ({n_pred}) and labels ({n_ref}).")
        keep = min(n_pred, n_ref)
        predictions, labels = predictions[:keep], labels[:keep]

    return accuracy_score.compute(predictions=predictions,
                                  references=labels)

## Student Training Args

In [None]:
batch_size = 48

student_finetuned_ckpt = f"/kaggle/working/{student_model_name}"
student_training_args = DistillationTrainingArguments(
    output_dir=student_finetuned_ckpt,
    # optimisation
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=50,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # distillation: alpha=1 puts all the loss weight on the student's own loss
    alpha=1,
    # evaluation and logging
    eval_strategy="epoch",
    logging_steps=50,
    report_to="none",
    # checkpointing and upload
    save_strategy="no",  # do not save model
    save_steps=100_000,
    save_total_limit=None,
    push_to_hub=False,
)
student_finetuned_ckpt

## Student Trainer (without teacher feedback, i.e. alpha=1, so only the student's cross-entropy loss is used)

In [None]:
import warnings
warnings.filterwarnings("ignore")

# New student instance, built by the same factory used for later training runs.
tmp_stu_model = student_init()

student_finetune_trainer = DistillationTrainer(
    model=tmp_stu_model,
    args=student_training_args,
    teacher_model=teacher_model,
    train_dataset=clinc_encoded['train'],
    eval_dataset=clinc_encoded['validation'],
    tokenizer=student_tokenizer,
    compute_metrics=compute_metrics,
)
student_finetune_trainer.train()

In [None]:
# Write the fine-tuned student and its tokenizer into one directory.
student_model_save_name = f'model_{student_model_name}'
student_model_save_dir = f'/kaggle/working/{student_model_save_name}'

student_finetune_trainer.save_model(student_model_save_dir)  # model checkpoint
student_tokenizer.save_pretrained(student_model_save_dir)    # tokenizer files


## Benchmark Finetuned Student Model

In [None]:
# Load the saved fine-tuned student as a pipeline, benchmark it on the test
# split and merge its metrics into the running results.
pipe_fine = pipeline(task='text-classification',
                     model=student_model_save_dir,
                     device=device)
pb = PerformanceBenchmark(pipe_fine, clinc_ds['test'],
                          model_name=f'tmp_{student_model_save_name}')
performance_metrics.update(pb.run_benchmark())

In [None]:
# Tabulate the collected metrics: one row per benchmarked model (the dict keys).
df = pd.DataFrame.from_dict(performance_metrics, orient='index')
df

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

def plot_metrics(perf_metrics):
    """
    Scatter-plot accuracy (%) against average latency (ms) for every model.

    Args:
        perf_metrics: dict mapping a model name to a dict with the keys
            ``'avg_latency_msec'``, ``'accuracy'`` (a fraction, multiplied by
            100 for display) and ``'model_size_MBs'`` (used as marker area).

    The axis limits are derived from all models so that no point is clipped:
    the x-axis ends 10 ms past the slowest model, and the y-axis starts at 70
    or 5 points below the least accurate model, whichever is lower.
    """
    df = pd.DataFrame.from_dict(perf_metrics, orient='index')

    for idx in df.index:
        df_model = df.loc[idx]
        plt.scatter(df_model["avg_latency_msec"], df_model["accuracy"] * 100,
                    s=df_model["model_size_MBs"], label=idx, alpha=0.5)

    # Add legend with dynamic spacing
    legend = plt.legend(labelspacing=0.5,
                        handletextpad=0.5,
                        borderaxespad=0.5,
                        loc='upper right')
    # Fall back to the older attribute name when `legend_handles` is missing.
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    # Legend markers get a fixed size instead of the model-size-based area.
    for handle in handles:
        handle.set_sizes([30])

    lowest_accuracy = (df["accuracy"] * 100).min()
    plt.ylim(min(70, lowest_accuracy - 5), 100)
    # Base the x-limit on the slowest model, not on whichever model is listed first.
    plt.xlim(0, int(df["avg_latency_msec"].max() + 10))
    plt.ylabel("Accuracy (%)")
    plt.xlabel("Average latency (ms)")
    plt.tight_layout()
    plt.show()

In [None]:
# Plot the metrics collected so far.
plot_metrics(performance_metrics)

# Knowledge Distillation: Student learning from Teacher

## HyperParameter Search Optimization using Optuna

In [None]:
import optuna

def hp_space(trial):
    """
    Ask ``trial`` for one value per tuned hyperparameter and return them as a dict.

    Ranges requested: ``num_train_epochs`` integer in [5, 10], ``alpha`` float
    in [0, 1], ``temperature`` integer in [1, 20].
    """
    search_space = {}
    search_space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 5, 10)
    search_space["alpha"] = trial.suggest_float("alpha", 0, 1)
    search_space["temperature"] = trial.suggest_int("temperature", 1, 20)
    return search_space

In [None]:
!pip install -U ipywidgets # required for hyperparameter_search

**Note:** The hyperparameter search is run for only a small number of trials to keep the training time down. Running more trials may yield better hyperparameter values.

In [None]:
import warnings
warnings.filterwarnings("ignore")

# Give the existing trainer the student factory, then run the search.
student_finetune_trainer.model_init = student_init
best_hp_run = student_finetune_trainer.hyperparameter_search(
    hp_space=hp_space,
    n_trials=3,
    direction='maximize',  # to maximize accuracy
)

In [None]:
# Show the hyperparameter values stored on the search result.
best_hp_run.hyperparameters.items()

**Best parameters after running only 3 trials**: 
[('num_train_epochs', 10), ('alpha', 0.13763849494879565), ('temperature', 20)]

In [None]:
# Copy each hyperparameter of the search result onto the training arguments.
best_params = best_hp_run.hyperparameters
for name in best_params:
    setattr(student_training_args, name, best_params[name])

In [None]:
# Point the shared training arguments at a separate output directory.
student_distil_ckpt = f"{student_model_name}-distil"
student_distil_save_dir = f"/kaggle/working/{student_distil_ckpt}"
student_training_args.output_dir = student_distil_save_dir

# New student instance from the factory, trained with the updated arguments.
tmp_stu_model = student_init()

student_finetune_trainer = DistillationTrainer(
    model=tmp_stu_model,
    args=student_training_args,
    teacher_model=teacher_model,
    train_dataset=clinc_encoded['train'],
    eval_dataset=clinc_encoded['validation'],
    tokenizer=student_tokenizer,
    compute_metrics=compute_metrics,
)
student_finetune_trainer.train()


In [None]:
# Write the distilled student and its tokenizer into one directory.
distil_student_model_save_name = f'model_distil_{student_model_name}'
distil_student_model_save_dir = f'/kaggle/working/{distil_student_model_save_name}'

student_finetune_trainer.save_model(distil_student_model_save_dir)  # model checkpoint
student_tokenizer.save_pretrained(distil_student_model_save_dir)    # tokenizer files


In [None]:
# Display the name under which the distilled student was saved.
distil_student_model_save_name

In [None]:
# Load the saved distilled student as a pipeline, benchmark it on the test
# split and merge its metrics into the running results.
pipe_fine = pipeline(task='text-classification',
                     model=distil_student_model_save_dir,
                     device=device)
pb = PerformanceBenchmark(pipe_fine, clinc_ds['test'],
                          model_name=f'tmp_{distil_student_model_save_name}')
performance_metrics.update(pb.run_benchmark())

# Print the updated table and redraw the comparison plot.
df = pd.DataFrame.from_dict(performance_metrics, orient='index')
print(df)
plot_metrics(performance_metrics)

# Model Quantization
* Model quantization is a powerful tool for optimizing machine learning models for deployment in **resource-constrained environments**.
* It does this by converting the model's weights and sometimes activations **from higher precision** (e.g., 32-bit floating-point, FP32) **to lower precision** (e.g., 16-bit floating-point, **FP16**, or 8-bit integers, **INT8**).


## Quantization Types
**Post-Training Quantization**:

    Applied after the model is fully trained.
    Converts weights and/or activations to lower precision without retraining.
    Subtypes:
        Dynamic Quantization: Activations are quantized dynamically during inference.
        Static Quantization: A calibration step is used to determine ranges for activations.

**Quantization-Aware Training**:

    Simulates quantization during training to account for any accuracy loss.
    Typically offers better accuracy than post training quantization, especially for complex models.

**Note** Quantization of the model is done on 'cpu'
Reference: 

In [None]:
from torch.quantization import quantize_dynamic
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

quantized_model_name = 'distil-quantized'
model_ckpt = distil_student_model_save_dir

# Reload the saved distilled student and its tokenizer from disk.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)
model = model.to("cpu")  # the model is placed on the CPU before it is quantized

# Dynamically quantize the Linear layers to 8-bit integers.
quantized_model = quantize_dynamic(model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)

In [None]:
# Display the device reported by the quantized model.
quantized_model.device

## Benchmark Quantized Model

In [None]:
# Wrap the quantized model in a CPU pipeline, benchmark it on the test split
# and merge its metrics into the running results.
pipe_fine_quantized = pipeline(task='text-classification',
                               model=quantized_model,
                               tokenizer=tokenizer,
                               device="cpu")
pb = PerformanceBenchmark(pipe_fine_quantized, clinc_ds['test'],
                          model_name=f'tmp_{quantized_model_name}')
performance_metrics.update(pb.run_benchmark())

# Print the updated table and redraw the comparison plot.
df = pd.DataFrame.from_dict(performance_metrics, orient='index')
print(df)
plot_metrics(performance_metrics)

**Note** Quantized model was validated on CPU

# ONNX for Model Inference Optimization
* ONNX (Open Neural Network Exchange)
* ONNX allows you to take a model trained in one environment and run it in many others
* standardized way to describe models
* Once a model is converted to ONNX format, it can be optimized for inference in various ways, such as reducing model size
* ONNX can be used with **ONNX Runtime**, an inference engine optimized for running ONNX models.

In [None]:
# import os
# from psutil import cpu_count
# os.environ["OMP_NUM_THREADS"] = f"{cpu_count()}" # to utilize all available cores to maximize parallel processing.
# os.environ["OMP_WAIT_POLICY"] = "ACTIVE" # ensures that threads remain in an active, busy-wait state, can reduce latency

In [None]:
# from transformers.convert_graph_to_onnx import convert

# onnx_model_path = Path("/kaggle/working/onnx/model.onnx")
# convert(framework="pt", model=model_ckpt, tokenizer=tokenizer,
#         output=onnx_model_path, opset=13, pipeline_name="text-classification") # opset 13 is chosen because it is a stable, widely-supported version