In [None]:
import matplotlib.pyplot as plt
from transformers import pipeline
import torch
from torch import quantize_per_tensor

# Build a text-classification pipeline from the fine-tuned DistilBERT
# checkpoint and pull out the model parameters.
bert_ckpt = "transformersbook/distilbert-base-uncased-finetuned-clinc"
pipe = pipeline("text-classification", model=bert_ckpt)
state_dict = pipe.model.state_dict()
# NOTE(review): this is not the last expression of the cell, so the keys are
# not displayed; it has no other effect.
state_dict.keys()

# Quantize the first layer's attention output-projection weights to int8.
weights = state_dict["distilbert.transformer.layer.0.attention.out_lin.weight"]
# Step size: the observed value range divided into 255 intervals.
scale = (weights.max() - weights.min()) / 255
zero_point = 0
dtype = torch.qint8
quantized_weights = quantize_per_tensor(weights, scale, zero_point, dtype)

# Side-by-side histograms: original float weights vs. dequantized int8 weights,
# both restricted to the [-0.3, 0.3] range.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
ax1.set_title('before quantization')
ax2.set_title('after quantization')

ax1.hist(weights.flatten().numpy(), bins=250, range=(-0.3, 0.3))
ax2.hist(quantized_weights.flatten().dequantize().numpy(), bins=250, range=(-0.3, 0.3))
plt.show()

In [None]:
# Quantize the first layer's attention output-projection weights to int8
# (same computation as in the first cell; needs `state_dict` from that cell).
weights = state_dict["distilbert.transformer.layer.0.attention.out_lin.weight"]
# Step size: the observed value range divided into 255 intervals.
scale = (weights.max() - weights.min()) / 255
zero_point = 0
dtype = torch.qint8
quantized_weights = quantize_per_tensor(weights, scale, zero_point, dtype)

In [None]:
# Compare the weight distribution before and after quantization: the right
# panel shows the int8 tensor converted back to float via dequantize().
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
ax1.set_title('before quantization')
ax2.set_title('after quantization')

# Both histograms use 250 bins over [-0.3, 0.3] so the panels are comparable.
ax1.hist(weights.flatten().numpy(), bins=250, range=(-0.3, 0.3))
ax2.hist(quantized_weights.flatten().dequantize().numpy(), bins=250, range=(-0.3, 0.3))
# plt.axis('off')
plt.show()

In [None]:
%%timeit -r 10 -n 1000
# Baseline timing: matrix product of the float weight matrix with itself.
weights @ weights

In [None]:
from torch.nn.quantized import QFunctional

# Functional wrapper for quantized tensors; the timing cell below calls its
# `mul` on the int8 weights.
q_fn = QFunctional()

In [None]:
%%timeit -r 10 -n 1000
# NOTE(review): `mul` is an element-wise product, whereas the float baseline
# cell times `@` (matrix product), so the two timings are not measuring the
# same operation - confirm this is the intended comparison.
q_fn.mul(quantized_weights, quantized_weights)

In [None]:
from torch.quantization import quantize_dynamic
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the fine-tuned BERT classifier and its tokenizer, keep the model on the
# CPU, then dynamically quantize its nn.Linear layers to int8.
model_ckpt = 'transformersbook/bert-base-uncased-finetuned-clinc'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt).to('cpu')
model_quantized = quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)

In [None]:
from datasets import load_metric
from pathlib import Path
from time import perf_counter
import numpy as np
from datasets import load_dataset

# Load the "plus" configuration of the clinc_oos dataset.
clinc = load_dataset("clinc_oos", "plus")
# Label feature of the test split; the benchmark class uses it to turn
# predicted label names into integer ids.
intents = clinc["test"].features["intent"]
# Accuracy metric object shared by the benchmark code below.
accuracy_score = load_metric("accuracy")

class PerformanceBenchmark:
    """Benchmark a text-classification pipeline for size, latency and accuracy.

    Args:
        pipeline: Callable that takes a text query and returns a list whose
            first element is a dict with a "label" key. For ``compute_size``
            it must also expose ``.model`` with a ``state_dict()`` method.
        dataset: Iterable of examples with "text" and "intent" keys.
        optim_type: Key under which ``run_benchmark`` files the metrics.
    """

    def __init__(self, pipeline, dataset, optim_type="BERT baseline"):
        self.pipeline = pipeline
        self.dataset = dataset
        self.optim_type = optim_type

    def compute_accuracy(self):
        """Run the pipeline over ``self.dataset`` and return the accuracy metric.

        Uses the notebook-level ``intents`` object to convert predicted label
        names to ids and the notebook-level ``accuracy_score`` metric.
        """
        preds, labels = [], []
        for example in self.dataset:
            pred = self.pipeline(example["text"])[0]["label"]
            preds.append(intents.str2int(pred))
            labels.append(example["intent"])
        accuracy = accuracy_score.compute(predictions=preds, references=labels)
        print(f"valid accuracy : {accuracy['accuracy']:.4f}")
        return accuracy

    def compute_size(self):
        """Return ``{"size_mb": ...}``: the on-disk size of the saved state dict."""
        tmp_path = Path("model.pt")
        try:
            torch.save(self.pipeline.model.state_dict(), tmp_path)
            size_mb = tmp_path.stat().st_size / (1024 * 1024)
        finally:
            # Remove the scratch file even when saving or stat() fails, so a
            # failed run does not leave model.pt behind.
            if tmp_path.exists():
                tmp_path.unlink()
        print(f"model size : {size_mb:.4f} MB")
        return {"size_mb": size_mb}

    def time_pipeline(self, query="What is the pin number for my account?"):
        """Return mean and standard deviation of the latency (ms) for ``query``.

        Ten untimed warm-up calls are followed by 100 timed calls.
        """
        for _ in range(10):
            self.pipeline(query)
        latencies = []
        for _ in range(100):
            start_time = perf_counter()
            self.pipeline(query)
            latencies.append(perf_counter() - start_time)
        time_avg_ms = 1000 * np.mean(latencies)
        time_std_ms = 1000 * np.std(latencies)
        # "+/-" replaces the former "+\-", which was an invalid escape sequence.
        print(f"time avg : {time_avg_ms:.4f} ms +/- {time_std_ms:.4f} ms")
        return {"time_avg_ms": time_avg_ms, "time_std_ms": time_std_ms}

    def run_benchmark(self):
        """Return ``{optim_type: {size, latency and accuracy metrics}}``."""
        metrics = {}
        metrics[self.optim_type] = self.compute_size()
        metrics[self.optim_type].update(self.time_pipeline())
        metrics[self.optim_type].update(self.compute_accuracy())
        return metrics

In [None]:
# Benchmark the unmodified float model on the test split; this starts the
# `perf_metrics` dict that later cells extend.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
optim_type = "normal"
pb = PerformanceBenchmark(pipe, clinc["test"], optim_type)
perf_metrics = pb.run_benchmark()

In [None]:
# Benchmark the dynamically quantized model and merge its metrics into
# `perf_metrics` under the "quantization" key.
pipe = pipeline("text-classification", model=model_quantized, tokenizer=tokenizer)
optim_type = "quantization"
pb = PerformanceBenchmark(pipe, clinc["test"], optim_type)
perf_metrics.update(pb.run_benchmark())

In [None]:
# Display the metrics collected so far, keyed by optimisation type.
perf_metrics

In [None]:
from transformers import TrainingArguments

class DistilTrainingArguments(TrainingArguments):
    """TrainingArguments carrying two extra distillation hyperparameters.

    Args:
        alpha: Stored as ``self.alpha``; ``DistilTrainer.compute_loss`` reads
            it as the weight of the student's own loss.
        temperature: Stored as ``self.temperature``; ``DistilTrainer`` reads
            it as the softmax temperature of the distillation term.

    All other positional and keyword arguments go to ``TrainingArguments``.
    """

    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        # Let the base class consume everything but the two distillation knobs.
        super().__init__(*args, **kwargs)
        self.temperature = temperature
        self.alpha = alpha
        
import torch.nn as nn
import torch.nn.functional as F
from transformers import Trainer

class DistilTrainer(Trainer):
    """Trainer whose loss blends the student's own loss with a distillation term.

    The distillation term is the KL divergence between the temperature-softened
    student and teacher distributions. The mixing weight ``alpha`` and the
    ``temperature`` are read from ``self.args``.
    """

    def __init__(self, *args, teacher=None, **kwargs):
        """Store ``teacher``; all other arguments are forwarded to ``Trainer``."""
        super().__init__(*args, **kwargs)
        self.teacher = teacher

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return ``alpha * student_loss + (1 - alpha) * distillation_loss``.

        Args:
            model: The student model being trained.
            inputs: Keyword inputs passed unchanged to both student and teacher.
            return_outputs: When true, return ``(loss, student_outputs)``.
            num_items_in_batch: Unused. Accepted so that Trainer versions which
                pass this keyword to ``compute_loss`` do not raise TypeError.
        """
        # Student forward pass: its own loss and logits.
        student_outputs = model(**inputs)
        loss_ce = student_outputs.loss
        student_logits = student_outputs.logits

        # Teacher logits only; no gradients flow into the teacher.
        with torch.no_grad():
            teacher_outputs = self.teacher(**inputs)
            teacher_logits = teacher_outputs.logits

        # Distillation loss: KL divergence between temperature-softened
        # distributions, multiplied by temperature**2.
        temperature = self.args.temperature
        loss_fct = nn.KLDivLoss(reduction='batchmean')
        loss_kd = temperature**2 * loss_fct(
            F.log_softmax(student_logits / temperature, dim=-1),
            F.softmax(teacher_logits / temperature, dim=-1),
        )

        # Weighted blend of the two terms.
        loss = self.args.alpha * loss_ce + (1 - self.args.alpha) * loss_kd

        return (loss, student_outputs) if return_outputs else loss

In [None]:
from datasets import load_dataset, load_metric
from transformers import AutoModelForSequenceClassification

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

clinc = load_dataset("clinc_oos", "plus")
intents = clinc["test"].features["intent"]
# Number of intent classes; sizes the classification heads below.
num_labels = intents.num_classes

accuracy_score = load_metric("accuracy")

# NOTE(review): both checkpoints here are the base (not CLINC-fine-tuned)
# models; this cell only uses them to compare parameter counts.
student_ckpt = "distilbert-base-uncased"
teacher_ckpt = "bert-base-uncased"

student = (AutoModelForSequenceClassification.from_pretrained(student_ckpt, num_labels=num_labels).to(device))
teacher = (AutoModelForSequenceClassification.from_pretrained(teacher_ckpt, num_labels=num_labels).to(device))

# Prints the student's parameter count as a percentage of the teacher's.
print(f"teacher 대비 student의 파라미터 비율: {student.num_parameters() / teacher.num_parameters() * 100:.4f}%")

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset, load_metric
import numpy as np

# Reload the dataset, label feature and accuracy metric (same calls as in
# earlier cells) so this cell can run on its own.
clinc = load_dataset("clinc_oos", "plus")
intents = clinc["test"].features["intent"]
accuracy_score = load_metric("accuracy")

# Tokenizer of the student checkpoint; used to encode the dataset below.
student_ckpt = "distilbert-base-uncased"
student_tokenizer = AutoTokenizer.from_pretrained(student_ckpt)

def tokenize_text(batch):
    """Tokenize the "text" field of ``batch`` with the student tokenizer, truncating."""
    texts = batch["text"]
    return student_tokenizer(texts, truncation=True)

# Tokenize every split in batches and drop the raw text column.
clinc_enc = clinc.map(tokenize_text, batched=True, remove_columns=['text'])
# Rename the target column to 'labels' - presumably the name the
# model/Trainer expects for targets; confirm against the Trainer docs.
clinc_enc = clinc_enc.rename_column('intent', 'labels')

def compute_metrics(pred):
    """Return the accuracy metric for a ``(scores, labels)`` pair.

    The predicted class of each row is the index of its largest score
    along axis 1.
    """
    scores, references = pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy_score.compute(predictions=predicted_ids, references=references)

In [None]:
batch_size = 128

# Training configuration for the student. With alpha=1 the DistilTrainer loss
# `alpha * loss_ce + (1 - alpha) * loss_kd` gives the distillation term zero
# weight, so this run trains on the student's own loss only.
finetuned_ckpt = "distilbert-base-uncased-finetuned-clinc"
student_training_args = DistilTrainingArguments(output_dir=finetuned_ckpt, 
                                                 evaluation_strategy='epoch', 
                                                 num_train_epochs=100,
                                                 logging_steps=100,
                                                 learning_rate=2e-5, 
                                                 per_device_train_batch_size=batch_size, 
                                                 per_device_eval_batch_size=batch_size, 
                                                 alpha=1, 
                                                 temperature=2,
                                                 weight_decay=0.01, 
                                                 push_to_hub=False)

In [None]:
import torch
from transformers import pipeline, AutoConfig, AutoModelForSequenceClassification

# Load the published fine-tuned DistilBERT pipeline only to reuse its
# id <-> label mappings for the student configuration.
bert_ckpt = "transformersbook/distilbert-base-uncased-finetuned-clinc"
pipe = pipeline("text-classification", model=bert_ckpt)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

id2label = pipe.model.config.id2label
label2id = pipe.model.config.label2id

# `intents` and `student_ckpt` come from earlier cells.
num_labels = intents.num_classes
student_config = (AutoConfig.from_pretrained(student_ckpt, num_labels=num_labels, id2label=id2label, label2id=label2id))

def student_init():
    """Build a student model from ``student_ckpt`` with ``student_config`` on ``device``.

    Passed to the trainer as ``model_init``.
    """
    student_model = AutoModelForSequenceClassification.from_pretrained(student_ckpt, config=student_config)
    return student_model.to(device)

In [None]:
# Teacher: the CLINC-fine-tuned BERT checkpoint, moved to `device`.
teacher_ckpt = "transformersbook/bert-base-uncased-finetuned-clinc"
teacher = (AutoModelForSequenceClassification.from_pretrained(teacher_ckpt, num_labels=num_labels).to(device))

# Train the student with the arguments from the previous cell, evaluating on
# the validation split.
distilbert_trainer = DistilTrainer(model_init=student_init, 
                                    teacher=teacher, 
                                    args=student_training_args, 
                                    train_dataset=clinc_enc['train'],
                                    eval_dataset=clinc_enc['validation'],
                                    compute_metrics=compute_metrics,
                                    tokenizer=student_tokenizer)

distilbert_trainer.train()

In [None]:
# Distillation run: batch size 512, alpha=0.7, temperature=2, 100 epochs.
batch_size = 512

# NOTE(review): same output_dir as the other training runs, so checkpoints
# from different runs share one directory.
finetuned_ckpt = "distilbert-base-uncased-finetuned-clinc"
student_training_args = DistilTrainingArguments(output_dir=finetuned_ckpt, 
                                                 evaluation_strategy='epoch', 
                                                 num_train_epochs=100,
                                                 logging_steps=100,
                                                 learning_rate=2e-5, 
                                                 per_device_train_batch_size=batch_size, 
                                                 per_device_eval_batch_size=batch_size, 
                                                 alpha=0.7, 
                                                 temperature=2,
                                                 weight_decay=0.01, 
                                                 push_to_hub=False)

# Teacher: the CLINC-fine-tuned BERT checkpoint.
teacher_ckpt = "transformersbook/bert-base-uncased-finetuned-clinc"
teacher = (AutoModelForSequenceClassification.from_pretrained(teacher_ckpt, num_labels=num_labels).to(device))

distilbert_trainer = DistilTrainer(model_init=student_init, 
                                    teacher=teacher, 
                                    args=student_training_args, 
                                    train_dataset=clinc_enc['train'],
                                    eval_dataset=clinc_enc['validation'],
                                    compute_metrics=compute_metrics,
                                    tokenizer=student_tokenizer)

distilbert_trainer.train()

In [None]:
# Distillation run: batch size 512, alpha=0.7, temperature=10, 100 epochs.
batch_size = 512

# NOTE(review): same output_dir as the other training runs, so checkpoints
# from different runs share one directory.
finetuned_ckpt = "distilbert-base-uncased-finetuned-clinc"
student_training_args = DistilTrainingArguments(output_dir=finetuned_ckpt, 
                                                 evaluation_strategy='epoch', 
                                                 num_train_epochs=100,
                                                 logging_steps=100,
                                                 learning_rate=2e-5, 
                                                 per_device_train_batch_size=batch_size, 
                                                 per_device_eval_batch_size=batch_size, 
                                                 alpha=0.7, 
                                                 temperature=10,
                                                 weight_decay=0.01, 
                                                 push_to_hub=False)

# Teacher: the CLINC-fine-tuned BERT checkpoint.
teacher_ckpt = "transformersbook/bert-base-uncased-finetuned-clinc"
teacher = (AutoModelForSequenceClassification.from_pretrained(teacher_ckpt, num_labels=num_labels).to(device))

distilbert_trainer = DistilTrainer(model_init=student_init, 
                                    teacher=teacher, 
                                    args=student_training_args, 
                                    train_dataset=clinc_enc['train'],
                                    eval_dataset=clinc_enc['validation'],
                                    compute_metrics=compute_metrics,
                                    tokenizer=student_tokenizer)

distilbert_trainer.train()

In [None]:
# Distillation run: batch size 512, alpha=0.5, temperature=5, 200 epochs.
batch_size = 512

# NOTE(review): same output_dir as the other training runs, so checkpoints
# from different runs share one directory.
finetuned_ckpt = "distilbert-base-uncased-finetuned-clinc"
student_training_args = DistilTrainingArguments(output_dir=finetuned_ckpt, 
                                                 evaluation_strategy='epoch', 
                                                 num_train_epochs=200,
                                                 logging_steps=100,
                                                 learning_rate=2e-5, 
                                                 per_device_train_batch_size=batch_size, 
                                                 per_device_eval_batch_size=batch_size, 
                                                 alpha=0.5, 
                                                 temperature=5,
                                                 weight_decay=0.01, 
                                                 push_to_hub=False)

# Teacher: the CLINC-fine-tuned BERT checkpoint.
teacher_ckpt = "transformersbook/bert-base-uncased-finetuned-clinc"
teacher = (AutoModelForSequenceClassification.from_pretrained(teacher_ckpt, num_labels=num_labels).to(device))

distilbert_trainer = DistilTrainer(model_init=student_init, 
                                    teacher=teacher, 
                                    args=student_training_args, 
                                    train_dataset=clinc_enc['train'],
                                    eval_dataset=clinc_enc['validation'],
                                    compute_metrics=compute_metrics,
                                    tokenizer=student_tokenizer)

distilbert_trainer.train()

In [None]:
from datasets import load_metric
from pathlib import Path
from time import perf_counter
import numpy as np
from datasets import load_dataset

# Reload dataset, label feature and accuracy metric for the benchmark class
# defined below (same calls as in earlier cells).
clinc = load_dataset("clinc_oos", "plus")
intents = clinc["test"].features["intent"]
accuracy_score = load_metric("accuracy")

class PerformanceBenchmark:
    """Benchmark a text-classification pipeline for size, latency and accuracy.

    Args:
        pipeline: Callable that takes a text query and returns a list whose
            first element is a dict with a "label" key. For ``compute_size``
            it must also expose ``.model`` with a ``state_dict()`` method.
        dataset: Iterable of examples with "text" and "intent" keys.
        optim_type: Key under which ``run_benchmark`` files the metrics.
    """

    def __init__(self, pipeline, dataset, optim_type="BERT baseline"):
        self.pipeline = pipeline
        self.dataset = dataset
        self.optim_type = optim_type

    def compute_accuracy(self):
        """Run the pipeline over ``self.dataset`` and return the accuracy metric.

        Uses the notebook-level ``intents`` object to convert predicted label
        names to ids and the notebook-level ``accuracy_score`` metric.
        """
        preds, labels = [], []
        for example in self.dataset:
            pred = self.pipeline(example["text"])[0]["label"]
            preds.append(intents.str2int(pred))
            labels.append(example["intent"])
        accuracy = accuracy_score.compute(predictions=preds, references=labels)
        print(f"valid accuracy : {accuracy['accuracy']:.4f}")
        return accuracy

    def compute_size(self):
        """Return ``{"size_mb": ...}``: the on-disk size of the saved state dict."""
        tmp_path = Path("model.pt")
        try:
            torch.save(self.pipeline.model.state_dict(), tmp_path)
            size_mb = tmp_path.stat().st_size / (1024 * 1024)
        finally:
            # Remove the scratch file even when saving or stat() fails, so a
            # failed run does not leave model.pt behind.
            if tmp_path.exists():
                tmp_path.unlink()
        print(f"model size : {size_mb:.4f} MB")
        return {"size_mb": size_mb}

    def time_pipeline(self, query="What is the pin number for my account?"):
        """Return mean and standard deviation of the latency (ms) for ``query``.

        Ten untimed warm-up calls are followed by 100 timed calls.
        """
        for _ in range(10):
            self.pipeline(query)
        latencies = []
        for _ in range(100):
            start_time = perf_counter()
            self.pipeline(query)
            latencies.append(perf_counter() - start_time)
        time_avg_ms = 1000 * np.mean(latencies)
        time_std_ms = 1000 * np.std(latencies)
        # "+/-" replaces the former "+\-", which was an invalid escape sequence.
        print(f"time avg : {time_avg_ms:.4f} ms +/- {time_std_ms:.4f} ms")
        return {"time_avg_ms": time_avg_ms, "time_std_ms": time_std_ms}

    def run_benchmark(self):
        """Return ``{optim_type: {size, latency and accuracy metrics}}``."""
        metrics = {}
        metrics[self.optim_type] = self.compute_size()
        metrics[self.optim_type].update(self.time_pipeline())
        metrics[self.optim_type].update(self.compute_accuracy())
        return metrics

In [None]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Original fine-tuned BERT on the CPU: load tokenizer and model, wrap them in
# a pipeline.
model_ckpt = 'transformersbook/bert-base-uncased-finetuned-clinc'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt).to('cpu')

pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
optim_type = "original-BERT"
# Benchmark run is currently disabled:
# pb = PerformanceBenchmark(pipe, clinc["validation"], optim_type)
# perf_metrics = pb.run_benchmark()

In [None]:
# Distilled model and tokenizer, loaded from a local training checkpoint.
# Note that this rebinds `tokenizer` and `model` to the distilled versions.

model_ckpt = './distilbert-base-uncased-finetuned-clinc/checkpoint-500'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt).to('cpu')

pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
optim_type = "distilBert"
# Benchmark run is currently disabled:
# pb = PerformanceBenchmark(pipe, clinc["test"], optim_type)
# perf_metrics = pb.run_benchmark()

In [None]:
# Load the original fine-tuned BERT model and tokenizer under dedicated names.
model_ckpt = 'transformersbook/bert-base-uncased-finetuned-clinc'
tokenizer_original = AutoTokenizer.from_pretrained(model_ckpt)
model_original = AutoModelForSequenceClassification.from_pretrained(model_ckpt).to('cpu')

# Load the distilled model and tokenizer from the local training checkpoint.
model_ckpt = './distilbert-base-uncased-finetuned-clinc/checkpoint-500'
tokenizer_distil = AutoTokenizer.from_pretrained(model_ckpt)
model_distil = AutoModelForSequenceClassification.from_pretrained(model_ckpt).to('cpu')

In [None]:
# Pipeline for the original model; the benchmark run itself is disabled.
pipe_original = pipeline("text-classification", model=model_original, tokenizer=tokenizer_original)
optim_type = "original"
# pb = PerformanceBenchmark(pipe_original, clinc["test"], optim_type)
# perf_metrics = pb.run_benchmark()

In [None]:
# Pipeline for the distilled model; the benchmark run itself is disabled.
pipe_distil = pipeline("text-classification", model=model_distil, tokenizer=tokenizer_distil)
optim_type = "distil"
# pb = PerformanceBenchmark(pipe_distil, clinc["test"], optim_type)
# perf_metrics.update(pb.run_benchmark())

In [None]:
# Quantized original BERT (CPU).
from torch.quantization import quantize_dynamic

# Dynamically quantize the nn.Linear layers of the original model to int8.
model_quantized = quantize_dynamic(model_original, {nn.Linear}, dtype=torch.qint8)
# Use the tokenizer loaded from the same checkpoint as the model. The bare
# `tokenizer` name was last bound to the distilled checkpoint's tokenizer.
pipe = pipeline("text-classification", model=model_quantized, tokenizer=tokenizer_original)
optim_type = "quantized"
# Benchmark run is currently disabled:
# pb = PerformanceBenchmark(pipe, clinc["test"], optim_type)
# perf_metrics.update(pb.run_benchmark())

In [None]:
# Distillation + quantization: dynamically quantize the nn.Linear layers of
# the distilled model to int8.
from torch.quantization import quantize_dynamic

model_quantized2 = quantize_dynamic(model_distil, {nn.Linear}, dtype=torch.qint8)
# `tokenizer` here is whatever an earlier cell last bound; in notebook order
# that is the distilled checkpoint's tokenizer.
pipe = pipeline("text-classification", model=model_quantized2, tokenizer=tokenizer)
optim_type = "distil+quantized"
# Benchmark run is currently disabled:
# pb = PerformanceBenchmark(pipe, clinc["test"], optim_type)
# perf_metrics.update(pb.run_benchmark())

In [None]:
import os
from psutil import cpu_count

# Set the OpenMP thread count to the CPU count reported by psutil.
os.environ["OMP_NUM_THREADS"] = str(cpu_count())
# NOTE(review): ACTIVE presumably makes waiting threads spin instead of
# sleeping - confirm against the OpenMP runtime documentation.
os.environ["OMP_WAIT_POLICY"] = "ACTIVE"

In [None]:
from transformers.convert_graph_to_onnx import convert
from pathlib import Path

# Export the distilled model to ONNX at onnx/model.onnx.
onnx_model_path = Path("onnx/model.onnx")
convert(framework="pt", 
        model=model_distil, # model path (local/hub) or an already-loaded model
        tokenizer=tokenizer, 
        output=onnx_model_path, 
        opset=17, # ONNX opset version to export with
        pipeline_name='text-classification')

In [None]:
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions

def create_model_for_provider(model_path, provider=None, intra_op_num_threads=32):
    """Create an ONNX Runtime inference session for ``model_path``.

    Args:
        model_path: Path (str or path-like) of the ONNX model file.
        provider: Execution provider list passed to ``InferenceSession``; a
            single provider name is also accepted. Defaults to
            ``["CPUExecutionProvider"]``.
        intra_op_num_threads: Value assigned to
            ``SessionOptions.intra_op_num_threads`` (default 32, the value
            that was previously hard-coded).

    Returns:
        The ``InferenceSession``, with all graph optimisations enabled and
        provider fallback disabled.
    """
    # Build the default per call instead of sharing one mutable list object
    # as the default argument.
    if provider is None:
        provider = ["CPUExecutionProvider"]
    elif isinstance(provider, str):
        provider = [provider]

    options = SessionOptions()
    options.intra_op_num_threads = intra_op_num_threads
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    session = InferenceSession(str(model_path), options, providers=provider)
    session.disable_fallback()

    return session

In [None]:
from scipy.special import softmax

class OnnxPipeline:
    """Minimal text-classification pipeline on top of an ONNX Runtime session.

    Calling an instance returns ``[{"label": ..., "score": ...}]`` for the
    highest-probability class. The label name is looked up through the
    notebook-level ``intents`` object.
    """

    def __init__(self, model, tokenizer) -> None:
        self.tokenizer = tokenizer
        self.model = model

    def __call__(self, query) -> list:
        # Tokenize, then convert every tensor to a NumPy array for the session.
        encoded = self.tokenizer(query, return_tensors="pt")
        feed = {}
        for name, tensor in encoded.items():
            feed[name] = tensor.cpu().detach().numpy()

        # First output, first row: the logits for the single query.
        outputs = self.model.run(None, feed)
        logits = outputs[0][0, :]
        probs = softmax(logits)
        best_idx = np.argmax(probs).item()
        label = intents.int2str(best_idx)

        return [{"label": label, "score": probs[best_idx]}]

In [None]:
class OnnxPerformanceBenchmark(PerformanceBenchmark):
    """PerformanceBenchmark variant that reports the size of an ONNX file.

    Args:
        model_path: Keyword-only path of the ONNX model file to measure.

    Remaining arguments are forwarded to ``PerformanceBenchmark``.
    """

    def __init__(self, *args, model_path, **kwargs):
        super().__init__(*args, **kwargs)
        self.model_path = model_path

    def compute_size(self):
        """Return ``{"size_mb": ...}`` from the file size of ``self.model_path``."""
        n_bytes = Path(self.model_path).stat().st_size
        size_mb = n_bytes / 1024 / 1024
        print(f"Model size: {size_mb:.2f} MB")
        return dict(size_mb=size_mb)

In [None]:
# Provider list: CUDA first (with explicit provider options), CPU second.
provider = [
    ('CUDAExecutionProvider', {
        'device_id': 0,
        'arena_extend_strategy': 'kNextPowerOfTwo',
        'gpu_mem_limit': 16 * 1024 * 1024 * 1024,  # 16 GiB expressed in bytes
        'cudnn_conv_algo_search': 'EXHAUSTIVE',
        'do_copy_in_default_stream': True,
    }),
    'CPUExecutionProvider',
]

onnx_model = create_model_for_provider(onnx_model_path, provider=provider)
onnx_pipe = OnnxPipeline(onnx_model, tokenizer)

# NOTE: this assignment replaces `perf_metrics` rather than updating it.
optim_type = "Distil + ORT"
pb = OnnxPerformanceBenchmark(onnx_pipe, clinc['test'], optim_type, model_path='onnx/model.onnx')
perf_metrics = pb.run_benchmark()

In [None]:
# Same benchmark with the CUDA provider left at its default options.
provider = ['CUDAExecutionProvider', 'CPUExecutionProvider']

onnx_model = create_model_for_provider(onnx_model_path, provider=provider)
onnx_pipe = OnnxPipeline(onnx_model, tokenizer)

# NOTE: same "Distil + ORT" key as the previous cell, and `perf_metrics` is
# replaced rather than updated.
optim_type = "Distil + ORT"
pb = OnnxPerformanceBenchmark(onnx_pipe, clinc['test'], optim_type, model_path='onnx/model.onnx')
perf_metrics = pb.run_benchmark()

In [None]:
# Same benchmark on the CPU execution provider only.
onnx_model = create_model_for_provider(onnx_model_path, provider=['CPUExecutionProvider'])
onnx_pipe = OnnxPipeline(onnx_model, tokenizer)

# NOTE: the "Distil + ORT" key is reused, so update() overwrites any entry
# stored under that key by an earlier cell.
optim_type = "Distil + ORT"
pb = OnnxPerformanceBenchmark(onnx_pipe, clinc['test'], optim_type, model_path='onnx/model.onnx')
perf_metrics.update(pb.run_benchmark())

In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType

# NOTE: `onnx_model` is rebound here from an inference session to a path
# string, and `quantize_dynamic` now refers to the onnxruntime function
# imported in this cell, not the torch one imported in earlier cells.
onnx_model = "onnx/model.onnx"
quant_onnx_model = "onnx/model.quant.onnx"

# Write an int8-weight quantized copy of the exported model.
quantize_dynamic(model_input=onnx_model, model_output=quant_onnx_model, weight_type=QuantType.QInt8)

In [None]:
# Benchmark the quantized ONNX model with the default provider of
# create_model_for_provider (CPU).
onnx_quantized_model = create_model_for_provider(quant_onnx_model)
pipe = OnnxPipeline(onnx_quantized_model, tokenizer)
optim_type = "Distil+ORT(with Quantized)"

pb = OnnxPerformanceBenchmark(pipe, clinc["test"], optim_type, model_path=quant_onnx_model)
perf_metrics.update(pb.run_benchmark())

In [None]:
# Benchmark the quantized ONNX model with CUDA (default options) listed
# ahead of CPU.
provider = ['CUDAExecutionProvider', 'CPUExecutionProvider']

onnx_quantized_model = create_model_for_provider(quant_onnx_model, provider=provider)
pipe = OnnxPipeline(onnx_quantized_model, tokenizer)
# NOTE: same key as the previous cell, so update() overwrites that entry.
optim_type = "Distil+ORT(with Quantized)"

pb = OnnxPerformanceBenchmark(pipe, clinc["test"], optim_type, model_path=quant_onnx_model)
perf_metrics.update(pb.run_benchmark())

In [None]:
# Benchmark the quantized ONNX model with explicit CUDA provider options,
# CPU listed second.
provider = [
    ('CUDAExecutionProvider', {
        'device_id': 0,
        'arena_extend_strategy': 'kNextPowerOfTwo',
        'gpu_mem_limit': 16 * 1024 * 1024 * 1024,  # 16 GiB expressed in bytes
        'cudnn_conv_algo_search': 'EXHAUSTIVE',
        'do_copy_in_default_stream': True,
    }),
    'CPUExecutionProvider',
]

onnx_quantized_model = create_model_for_provider(quant_onnx_model, provider=provider)
pipe = OnnxPipeline(onnx_quantized_model, tokenizer)
# NOTE: same key as the two previous cells, so update() overwrites that entry.
optim_type = "Distil+ORT(with Quantized)"

pb = OnnxPerformanceBenchmark(pipe, clinc["test"], optim_type, model_path=quant_onnx_model)
perf_metrics.update(pb.run_benchmark())

In [None]:
from datasets import load_metric
from pathlib import Path
from time import perf_counter
import numpy as np
from datasets import load_dataset

# Reload dataset, label feature and accuracy metric for the GPU benchmark
# section (same calls as in earlier cells).
clinc = load_dataset("clinc_oos", "plus")
intents = clinc["test"].features["intent"]
accuracy_score = load_metric("accuracy")

class PerformanceBenchmark:
    """Benchmark a text-classification pipeline for size, latency and accuracy.

    Args:
        pipeline: Callable that takes a text query and returns a list whose
            first element is a dict with a "label" key. For ``compute_size``
            it must also expose ``.model`` with a ``state_dict()`` method.
        dataset: Iterable of examples with "text" and "intent" keys.
        optim_type: Key under which ``run_benchmark`` files the metrics.
    """

    def __init__(self, pipeline, dataset, optim_type="BERT baseline"):
        self.pipeline = pipeline
        self.dataset = dataset
        self.optim_type = optim_type

    def compute_accuracy(self):
        """Run the pipeline over ``self.dataset`` and return the accuracy metric.

        Uses the notebook-level ``intents`` object to convert predicted label
        names to ids and the notebook-level ``accuracy_score`` metric.
        """
        preds, labels = [], []
        for example in self.dataset:
            pred = self.pipeline(example["text"])[0]["label"]
            preds.append(intents.str2int(pred))
            labels.append(example["intent"])
        accuracy = accuracy_score.compute(predictions=preds, references=labels)
        print(f"valid accuracy : {accuracy['accuracy']:.4f}")
        return accuracy

    def compute_size(self):
        """Return ``{"size_mb": ...}``: the on-disk size of the saved state dict."""
        tmp_path = Path("model.pt")
        try:
            torch.save(self.pipeline.model.state_dict(), tmp_path)
            size_mb = tmp_path.stat().st_size / (1024 * 1024)
        finally:
            # Remove the scratch file even when saving or stat() fails, so a
            # failed run does not leave model.pt behind.
            if tmp_path.exists():
                tmp_path.unlink()
        print(f"model size : {size_mb:.4f} MB")
        return {"size_mb": size_mb}

    def time_pipeline(self, query="What is the pin number for my account?"):
        """Return mean and standard deviation of the latency (ms) for ``query``.

        Ten untimed warm-up calls are followed by 100 timed calls.
        """
        for _ in range(10):
            self.pipeline(query)
        latencies = []
        for _ in range(100):
            start_time = perf_counter()
            self.pipeline(query)
            latencies.append(perf_counter() - start_time)
        time_avg_ms = 1000 * np.mean(latencies)
        time_std_ms = 1000 * np.std(latencies)
        # "+/-" replaces the former "+\-", which was an invalid escape sequence.
        print(f"time avg : {time_avg_ms:.4f} ms +/- {time_std_ms:.4f} ms")
        return {"time_avg_ms": time_avg_ms, "time_std_ms": time_std_ms}

    def run_benchmark(self):
        """Return ``{optim_type: {size, latency and accuracy metrics}}``."""
        metrics = {}
        metrics[self.optim_type] = self.compute_size()
        metrics[self.optim_type].update(self.time_pipeline())
        metrics[self.optim_type].update(self.compute_accuracy())
        return metrics

In [None]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# GPU section: the CUDA device is selected unconditionally here.
device = torch.device('cuda')

# Original fine-tuned BERT on the GPU.
model_ckpt = 'transformersbook/bert-base-uncased-finetuned-clinc'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt).to('cuda')

# device=0 runs the pipeline on the first CUDA device.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)
optim_type = "original-BERT"
pb = PerformanceBenchmark(pipe, clinc["test"], optim_type)
perf_metrics = pb.run_benchmark()

In [None]:
# Distilled model and tokenizer, loaded from the local training checkpoint
# and benchmarked on the first CUDA device.

model_ckpt = './distilbert-base-uncased-finetuned-clinc/checkpoint-500'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)

pipe = pipeline("text-classification", 
                model=model, 
                tokenizer=tokenizer, 
                device=0)
optim_type = "distilBert"
pb = PerformanceBenchmark(pipe, clinc["test"], optim_type)
# Fixed method name: the previous `run_bnechmark` raised AttributeError.
perf_metrics = pb.run_benchmark()

In [None]:
# Load the original fine-tuned BERT model and tokenizer; model goes to the GPU.
model_ckpt = 'transformersbook/bert-base-uncased-finetuned-clinc'
tokenizer_original = AutoTokenizer.from_pretrained(model_ckpt)
model_original = AutoModelForSequenceClassification.from_pretrained(model_ckpt).to('cuda')

# Load the distilled model and tokenizer from the local training checkpoint;
# model goes to the GPU.
model_ckpt = './distilbert-base-uncased-finetuned-clinc/checkpoint-500'
tokenizer_distil = AutoTokenizer.from_pretrained(model_ckpt)
model_distil = AutoModelForSequenceClassification.from_pretrained(model_ckpt).to('cuda')

In [None]:
# Benchmark the original model on the first CUDA device.
# NOTE: this assignment replaces any metrics already held in `perf_metrics`.
pipe_original = pipeline("text-classification", model=model_original, tokenizer=tokenizer_original, device=0)
optim_type = "original"
pb = PerformanceBenchmark(pipe_original, clinc["test"], optim_type)
perf_metrics = pb.run_benchmark()

In [None]:
# Benchmark the distilled model on the first CUDA device and merge the
# result into `perf_metrics` under the "distil" key.
pipe_distil = pipeline("text-classification", model=model_distil, tokenizer=tokenizer_distil, device=0)
optim_type = "distil"
pb = PerformanceBenchmark(pipe_distil, clinc["test"], optim_type)
perf_metrics.update(pb.run_benchmark())

In [None]:
# Quantized original BERT.
import copy
from torch.quantization import quantize_dynamic

# PyTorch dynamic quantization targets CPU backends, so quantizing the
# CUDA-resident model and running it with device=0 does not work. Quantize a
# CPU copy instead, so `model_original` itself stays on the GPU.
model_original_cpu = copy.deepcopy(model_original).to('cpu')
model_quantized = quantize_dynamic(model_original_cpu, {nn.Linear}, dtype=torch.qint8)
# Use the tokenizer that belongs to the original checkpoint, and run the
# pipeline on the CPU (device=-1).
pipe = pipeline("text-classification", model=model_quantized, tokenizer=tokenizer_original, device=-1)
optim_type = "quantized"
pb = PerformanceBenchmark(pipe, clinc["test"], optim_type)
perf_metrics.update(pb.run_benchmark())

In [None]:
# Distillation + quantization.
import copy
from torch.quantization import quantize_dynamic

# PyTorch dynamic quantization targets CPU backends, so quantizing the
# CUDA-resident model and running it with device=0 does not work. Quantize a
# CPU copy instead, so `model_distil` itself stays on the GPU.
model_distil_cpu = copy.deepcopy(model_distil).to('cpu')
model_quantized2 = quantize_dynamic(model_distil_cpu, {nn.Linear}, dtype=torch.qint8)
# Use the distilled checkpoint's tokenizer explicitly, and run the pipeline
# on the CPU (device=-1).
pipe = pipeline("text-classification", model=model_quantized2, tokenizer=tokenizer_distil, device=-1)
optim_type = "distil+quantized"
pb = PerformanceBenchmark(pipe, clinc["test"], optim_type)
perf_metrics.update(pb.run_benchmark())