# Model Export and Optimization Pipeline

This notebook exports and quantises a pre-trained sentiment analysis model through the following steps:

1. Loading a PyTorch model and tokeniser
2. Exporting the model to ONNX format so it can be run with ONNX Runtime for improved inference performance
3. Applying dynamic quantization to the exported ONNX model for further optimization
4. Comparing inference speed and outputs between:
   - Original PyTorch model
   - ONNX Runtime model 
   - Quantized model

The notebook includes a Gradio interface for interactive testing of all three model variants.

In [None]:
from time import perf_counter
from typing import Any

import gradio as gr
import pandas as pd
import torch
from optimum.onnxruntime import ORTModelForSequenceClassification, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Directory holding the checkpoint that the rest of the notebook exports and quantises.
model_path = "models/best/"

# local_files_only=True restricts loading to files already on disk.
pytorch_model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    local_files_only=True,
)

# The tokeniser is loaded by name rather than from model_path, then a copy is
# written to disk next to the exported models.
tokeniser = AutoTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
tokeniser.save_pretrained("models/tokeniser/")

In [None]:
# export=True converts the checkpoint at model_path to ONNX while loading it;
# the resulting model is bound to the CPU execution provider.
ort_model = ORTModelForSequenceClassification.from_pretrained(
    model_path, export=True, provider="CPUExecutionProvider"
)

# Persist the exported model so it can be reloaded without repeating the export.
ort_model.save_pretrained("models/ort/")

In [None]:
# Output directory for the quantized ONNX model.
quantized_model_path = "models/quantized_model/"

# Non-static (is_static=False) quantization settings for the AVX512-VNNI target,
# without per-channel quantization.
quantization_config = AutoQuantizationConfig.avx512_vnni(
    is_static=False,
    per_channel=False,
)

# Quantize the exported ONNX model and write the result to quantized_model_path.
quantizer = ORTQuantizer.from_pretrained(ort_model)
quantizer.quantize(
    quantization_config=quantization_config,
    save_dir=quantized_model_path,
)

# Reload the quantized model from disk so it can be called like the other variants.
quantized_model = ORTModelForSequenceClassification.from_pretrained(
    quantized_model_path,
    local_files_only=True,
)

In [None]:
def run_model(text_in: str, model: Any) -> tuple:
    """
    Runs a single text through a model and reports its output together with the inference time.

    The text is tokenised with the notebook-level ``tokeniser`` (truncated to at most 512
    tokens, returned as PyTorch tensors) and passed to ``model`` with gradient tracking
    disabled. Only the forward pass is timed; tokenisation and post-processing are excluded.

    The model output must expose a ``logits`` tensor holding exactly one element, because the
    value is extracted with ``.squeeze().item()``; a tensor with more than one element makes
    that call raise.

    :param text_in: Input text that needs to be processed by the model.
    :type text_in: str
    :param model: Any callable model that accepts the tokeniser output as keyword arguments
        and returns an object with a ``logits`` attribute.
    :type model: Any
    :return: ``(output, execution_time)`` - the single logit value and the forward-pass
        duration in seconds, both rounded to 5 decimal places.
    :rtype: tuple
    """
    encoded_text = tokeniser(
        text_in,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
    )

    # Gradients are never needed for inference, so disable tracking for the forward pass.
    with torch.no_grad():
        start_time = perf_counter()
        model_output = model(**encoded_text)
        elapsed = perf_counter() - start_time

    # squeeze() drops the size-1 dimensions so item() can return a plain Python float.
    score = model_output.logits.squeeze().item()

    return round(score, 5), round(elapsed, 5)

In [None]:
def compare_models(text_in: str) -> pd.DataFrame:
    """
    Runs the same text through the PyTorch, ONNX and quantized models and tabulates the results.

    Each model is invoked once via ``run_model``, in the order PyTorch, ONNX, AutoQuantization.

    :param text_in: The text input to be processed by each model
    :type text_in: str
    :return: A pandas DataFrame with one row per model and the columns ``Model``,
        ``Output`` and ``Time``
    :rtype: pd.DataFrame
    """
    variants = (
        ("PyTorch", pytorch_model),
        ("ONNX", ort_model),
        ("AutoQuantization", quantized_model),
    )

    rows = []
    for label, variant in variants:
        output, elapsed = run_model(text_in, variant)
        rows.append([label, output, elapsed])

    return pd.DataFrame(rows, columns=["Model", "Output", "Time"])

### Model Selection

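The Gradio app below passes the entered text through all three model variants (PyTorch, ONNX and quantized) and shows each model's output and inference time side by side, so the variants can be compared when choosing which one to use.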

In [None]:
# Interactive comparison UI: a text input with a button, next to a results table.
with gr.Blocks() as demo:
    with gr.Row():
        # First item in the row: the text input and the button that triggers a run.
        with gr.Column():
            text = gr.Textbox(placeholder="Paste a headline here.")
            run_button = gr.Button("Run")
        # Second item in the row: the table filled with compare_models' DataFrame.
        output_table = gr.DataFrame()

    # Clicking the button sends the textbox content to compare_models and
    # renders the returned DataFrame in the table.
    run_button.click(
        fn=compare_models,
        inputs=text,
        outputs=output_table,
    )

demo.launch()