# [DRAFT] Quantize a Hugging Face Model with NNCF

This tutorial shows how to quantize a model from [Hugging Face](https://huggingface.co/models) with [NNCF](https://github.com/openvinotoolkit/nncf), using the [OpenVINO Integration for Hugging Face Optimum](https://github.com/openvinotoolkit/openvino_contrib/tree/master/modules/optimum).

<strong>This notebook uses a legacy integration for OpenVINO into Hugging Face Optimum. For reference only.</strong>

To install dependencies for this notebook, please follow instructions on https://github.com/openvinotoolkit/openvino_contrib/tree/master/modules/optimum to install with `[all]` and install PyTorch separately.

Tested with Python 3.8, PyTorch 1.9.1, and Transformers 4.15.

In [1]:
import json
import logging
import os
import time
import xml.etree.ElementTree as ET
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, load_metric
from nncf.common.utils.logger import set_log_level

# Should be imported before transformers
from optimum.intel.nncf import NNCFAutoConfig
from optimum.intel.openvino import OVAutoModelForQuestionAnswering
from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    TrainingArguments,
    default_data_collator,
    set_seed,
)

# Local imports
from benchmark_utils import benchmark_model
from bert_utils import prepare_train_features, prepare_validation_features
from trainer_qa import QuestionAnsweringTrainer
from utils_qa import postprocess_qa_predictions

set_log_level(logging.ERROR)
set_seed(1)

  from .autonotebook import tqdm as notebook_tqdm


## Settings

In [2]:
# Identifier of the pretrained question-answering model to load with `from_pretrained`.
# The same string is also used to build the `output/<model_name>_fp32` and
# `output/<model_name>_int8` result directories further down.
model_name = "aware-ai/roberta-large-squadv2"

## NNCF

### NNCF Model

In [3]:
def post_processing_function(examples, features, predictions, stage="eval", output_dir=None):
    """Convert raw model predictions into an `EvalPrediction` of answers and references.

    Args:
        examples: Iterable of original examples; each must provide "id" and "answers".
        features: Tokenized features, forwarded to `postprocess_qa_predictions`.
        predictions: Raw model predictions, forwarded to `postprocess_qa_predictions`.
        stage: Accepted for interface compatibility; not used by this function.
        output_dir: Forwarded to `postprocess_qa_predictions`.

    Returns:
        `EvalPrediction` whose `predictions` is a list of
        `{"id", "prediction_text"}` dicts and whose `label_ids` is a list of
        `{"id", "answers"}` dicts.
    """
    label_column = "answers"  # TODO

    # Map every example id to the answer text selected from the start/end logits.
    text_by_id = postprocess_qa_predictions(
        examples=examples,
        features=features,
        predictions=predictions,
        version_2_with_negative=False,
        n_best_size=20,
        max_answer_length=30,
        output_dir=output_dir,
    )

    formatted_predictions = []
    for example_id, answer_text in text_by_id.items():
        formatted_predictions.append({"id": example_id, "prediction_text": answer_text})

    references = []
    for example in examples:
        references.append({"id": example["id"], "answers": example[label_column]})

    return EvalPrediction(predictions=formatted_predictions, label_ids=references)

In [4]:
class HuggingFaceModel:
    """Question-answering model wrapper for evaluation, benchmarking and NNCF quantization.

    The instance holds a tokenizer, a model (PyTorch, or OpenVINO when
    `model_path` is given), the example dataset and the metric used to score
    predictions.

    Attributes:
        model_name: Model identifier passed to the `from_pretrained` loaders.
        examples: Dataset of examples (needs `select`, `filter`, `map`, `column_names`).
        metric: Object with a `compute(predictions=..., references=...)` method.
        model_path: Directory of an OpenVINO model, or None for the PyTorch model.
        tokenizer: Tokenizer loaded for `model_name`.
        model: The loaded model.
        model_type: "PyTorch FP32", "OpenVINO FP32" or "OpenVINO INT8".
        ov_model_path: Path to `ov_model.xml` for OpenVINO models, otherwise None.
    """

    def __init__(self, model_name, examples, metric, model_path=None):
        """Load tokenizer and model and derive a human-readable model type.

        Args:
            model_name: Model identifier for tokenizer, config and PyTorch weights.
            examples: Dataset used for benchmarking, evaluation and quantization.
            metric: Metric object used to score predictions.
            model_path: If given, an OpenVINO model is loaded from this directory
                instead of the PyTorch model.
        """
        self.model_name = model_name
        self.examples = examples
        self.metric = metric
        self.model_path = model_path

        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            max_length=384,
            truncation="only_second",
            stride=128,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            # padding="max_length",
            use_fast=True,
        )

        if model_path is not None:
            self.model = OVAutoModelForQuestionAnswering.from_pretrained(
                model_path, config=AutoConfig.from_pretrained(model_name)
            )
        else:
            self.model = AutoModelForQuestionAnswering.from_pretrained(model_name).eval()

        if isinstance(self.model, torch.nn.Module):
            self.model_type = "PyTorch FP32"
            # Always define the attribute so the benchmark_app helpers fail on
            # their assertion rather than on a missing attribute.
            self.ov_model_path = None
        else:
            # A network that contains FakeQuantize operations is reported as INT8.
            is_quantized = any("FakeQuantize" in op.name for op in self.model.net.get_ops())
            precision = "INT8" if is_quantized else "FP32"
            self.model_type = f"OpenVINO {precision}"
            self.ov_model_path = Path(self.model_path) / "ov_model.xml"

    def compute_metrics(self, p):
        """Score an `EvalPrediction`-like object with the instance metric.

        Args:
            p: Object exposing `predictions` and `label_ids`.

        Returns:
            Whatever `self.metric.compute` returns.
        """
        return self.metric.compute(predictions=p.predictions, references=p.label_ids)

    def _decode_answer(self, encoded, result):
        """Return the text between the highest-scoring start and end positions.

        Args:
            encoded: Tokenizer output; `encoded["input_ids"]` must support `.tolist()`.
            result: Model output providing "start_logits" and "end_logits".
        """
        # The token ids of question + context for the single encoded sample.
        input_ids = encoded["input_ids"].tolist()[0]
        # Most likely start, and most likely end (+1 to make the slice inclusive).
        answer_start = np.argmax(result["start_logits"])
        answer_end = np.argmax(result["end_logits"]) + 1
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
        return self.tokenizer.convert_tokens_to_string(tokens)

    def get_answer(self, question, context, reference=None, metric_type=None):
        """Answer a single question about `context` with the wrapped model.

        Args:
            question: Question text.
            context: Text that is searched for the answer.
            reference: Optional reference answers, passed to the metric as "answers".
            metric_type: Optional metric name for `load_metric`; the metric is
                only computed when `reference` is given as well.

        Returns:
            The answer string, or `(answer, metric_result)` when both
            `reference` and `metric_type` are given.
        """
        encoded = self.tokenizer.encode_plus(
            question,
            context,
            return_tensors="pt",
            add_special_tokens=True,
        )
        if "PyTorch" in self.model_type:
            with torch.no_grad():
                result = self.model(**encoded, return_dict=True)
        else:
            result = self.model(**encoded, return_dict=True)

        answer = self._decode_answer(encoded, result)

        if metric_type is not None and reference is not None:
            metric = load_metric(metric_type)
            references = [{"id": 1, "answers": reference}]
            predictions = [{"id": 1, "prediction_text": answer}]
            metric_result = metric.compute(predictions=predictions, references=references)
            return answer, metric_result
        return answer

    def benchmark_loop(self, max_seq_length: int, padding: str, max_num_samples: int = 1000):
        """Time single-sample inference over the examples and score the answers.

        Args:
            max_seq_length: Maximum tokenized length, or None to disable truncation.
            padding: "do_not_pad" or "max_length".
            max_num_samples: Upper bound on the number of examples to run.

        Returns:
            Dict with the keys "model", "num_samples", "seq_length",
            "p50_latency" and "p90_latency" (milliseconds), "sps"
            (samples per second), "mean_f1" and "mean_em".
        """
        timings = []
        f1 = []
        em = []
        is_pytorch = "PyTorch" in self.model_type
        if max_seq_length is not None and not is_pytorch:
            self.model.use_dynamic_shapes = True

        max_num_samples = min(len(self.examples), max_num_samples)
        examples = self.examples.select(range(max_num_samples))

        warmup_question = "What is the prettiest color?"
        warmup_text = "Purple is the prettiest color"

        # Gradients are disabled only for the duration of the benchmark; turning
        # them off globally would also disable them for a later training run.
        with torch.no_grad():
            warmup_input = self.tokenizer.encode_plus(
                warmup_question,
                warmup_text,
                return_tensors="pt" if is_pytorch else "np",
                add_special_tokens=True,
            )
            self.model(**warmup_input, return_dict=False)

            for item in examples:
                # NOTE(review): the timed loop always requests "pt" tensors while
                # the warm-up uses "np" for OpenVINO models; confirm this is intended.
                encoded = self.tokenizer.encode_plus(
                    item["question"],
                    item["context"],
                    return_tensors="pt",
                    add_special_tokens=True,
                    max_length=max_seq_length,
                    padding=padding,
                    truncation=max_seq_length is not None,
                )
                start_time = time.perf_counter()
                result = self.model(**encoded, return_dict=True)
                end_time = time.perf_counter()

                timings.append((end_time - start_time) * 1000)

                answer = self._decode_answer(encoded, result)

                references = [{"id": item["id"], "answers": item["answers"]}]
                predictions = [{"id": item["id"], "prediction_text": answer}]
                metric_scores = self.metric.compute(references=references, predictions=predictions)

                f1.append(metric_scores["f1"])
                em.append(metric_scores["exact_match"])

        p50_latency = np.percentile(timings, 50).round(2)
        p90_latency = np.percentile(timings, 90).round(2)
        sps = round(len(examples) / (np.sum(timings) / 1000), 2)
        mean_f1 = np.mean(f1).round(2)
        mean_em = np.mean(em).round(2)

        return {
            "model": self.model_type,
            "num_samples": max_num_samples,
            "seq_length": max_seq_length,
            "p50_latency": p50_latency,
            "p90_latency": p90_latency,
            "sps": sps,
            "mean_f1": mean_f1,
            "mean_em": mean_em,
        }

    def benchmark_app(self, device, seconds, api, input_shape):
        """Run `benchmark_model` on the OpenVINO IR; only valid for OpenVINO models."""
        assert "OpenVINO" in self.model_type
        return benchmark_model(self.ov_model_path, device, seconds, api, input_shape)

    def benchmark_app_latency_throughput(self, device, seconds, input_shape):
        """Run `benchmark_model` once per hint and collect the matching result attribute.

        Returns:
            Dict with the keys "throughput" and "latency"; a value is None when
            the benchmark result has no attribute of that name.
        """
        assert "OpenVINO" in self.model_type
        result = {}
        for hint in ("throughput", "latency"):
            benchmark_result = benchmark_model(self.ov_model_path, device, seconds, hint, input_shape)
            result[hint] = getattr(benchmark_result, hint, None)
        return result

    def _prepare_datasets(self, examples, dataset_train_items):
        """Split `examples` into train/validation parts and tokenize both.

        The first `dataset_train_items` examples form the training split, the
        remaining ones the validation split.

        Returns:
            Tuple `(train_examples, validation_examples, train_dataset, validation_dataset)`.
        """
        train_examples = examples.select(range(0, dataset_train_items))
        validation_examples = examples.select(range(dataset_train_items, len(examples)))
        print(f"train examples: {len(train_examples)}, validation examples: {len(validation_examples)}")

        train_dataset = train_examples.map(
            lambda x: prepare_train_features(x, self.tokenizer, True),
            batched=True,
            num_proc=8,
            remove_columns=examples.column_names,
            load_from_cache_file=True,  # not data_args.overwrite_cache,
            desc="Running tokenizer on train dataset",
        )

        validation_dataset = validation_examples.map(
            lambda x: prepare_validation_features(x, self.tokenizer, True),
            batched=True,
            num_proc=8,
            remove_columns=examples.column_names,
            load_from_cache_file=True,  # not data_args.overwrite_cache,
            desc="Running tokenizer on validation dataset",
        )
        return train_examples, validation_examples, train_dataset, validation_dataset

    def evaluate_pytorch(self, dataset_train_i):
        """Evaluate the unquantized PyTorch model on the validation split.

        Args:
            dataset_train_i: Number of leading examples that belong to the
                training split; the remaining examples are evaluated.

        Returns:
            The metrics dict from the trainer, extended with "eval_samples".
        """
        output_path_fp32 = Path("output") / (self.model_name + "_fp32")
        # The model name may contain "/", so intermediate directories are needed.
        output_path_fp32.mkdir(exist_ok=True, parents=True)

        _, validation_examples, train_dataset, validation_dataset = self._prepare_datasets(
            self.examples, dataset_train_i
        )

        # Results of the FP32 evaluation go to the FP32 directory. Only
        # `evaluate()` is called below, so a single epoch is configured.
        training_args = TrainingArguments(
            str(output_path_fp32),
            do_train=True,
            local_rank=-1,
            num_train_epochs=1,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=1,
        )

        model = AutoModelForQuestionAnswering.from_pretrained(self.model_name)

        data_collator = DataCollatorWithPadding(self.tokenizer)

        trainer_fp32 = QuestionAnsweringTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,  # train_dataset if training_args.do_train else None,
            eval_dataset=validation_dataset,
            eval_examples=validation_examples,  # if training_args.do_eval else None,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            post_process_function=lambda x, y, z: post_processing_function(
                x, y, z, output_dir=training_args.output_dir
            ),
            compute_metrics=self.compute_metrics,
        )
        fp32_metrics = trainer_fp32.evaluate()
        fp32_metrics["eval_samples"] = len(validation_examples)
        trainer_fp32.save_metrics("eval", fp32_metrics)
        trainer_fp32.save_state()
        return fp32_metrics

    def quantize(
        self,
        nncf_config_file,
        qat,
        num_epochs,
        dataset_filter=None,
        dataset_train_items=None,
    ):
        """Quantize the model with NNCF, annotate the IR and evaluate the result.

        Args:
            nncf_config_file: Path to the NNCF JSON configuration.
            qat: If true, run `trainer.train()` and save metrics, state and model.
            num_epochs: Number of training epochs.
            dataset_filter: Optional title prefix; only examples whose "title"
                starts with it are used.
            dataset_train_items: Number of leading examples used for training;
                the remaining examples are used for validation. Required.

        Raises:
            TypeError: If `dataset_train_items` is not given.
        """
        if dataset_train_items is None:
            raise TypeError("dataset_train_items is required: pass the number of examples to use for training")

        output_path_fp32 = Path("output") / (self.model_name + "_fp32")
        output_path_int8 = Path("output") / (self.model_name + "_int8")
        output_path_fp32.mkdir(exist_ok=True, parents=True)
        output_path_int8.mkdir(exist_ok=True, parents=True)

        examples = self.examples

        if dataset_filter is not None:
            examples = self.examples.filter(lambda x: x["title"].startswith(dataset_filter))
            assert len(examples) > 0

        train_examples, validation_examples, train_dataset, validation_dataset = self._prepare_datasets(
            examples, dataset_train_items
        )

        training_args = TrainingArguments(
            str(output_path_int8),
            do_train=True,
            local_rank=-1,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=1,
        )

        model = AutoModelForQuestionAnswering.from_pretrained(self.model_name)

        nncf_config = NNCFAutoConfig.from_json(nncf_config_file)
        pad_to_max_length = True
        data_collator = default_data_collator if pad_to_max_length else DataCollatorWithPadding(self.tokenizer)

        trainer = QuestionAnsweringTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,  # train_dataset if training_args.do_train else None,
            eval_dataset=validation_dataset,
            eval_examples=validation_examples,  # if training_args.do_eval else None,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            post_process_function=lambda x, y, z: post_processing_function(
                x, y, z, output_dir=training_args.output_dir
            ),
            compute_metrics=self.compute_metrics,
            nncf_config=nncf_config,
        )

        if qat:
            train_result = trainer.train()
            metrics = train_result.metrics
            # Report the size of the training split, not of the whole dataset.
            metrics["train_samples"] = len(train_examples)
            trainer.save_metrics("train", metrics)
            trainer.save_state()
            trainer.save_model()  # Saves the tokenizer too for easy upload

        # Record how the model was produced inside the IR's metadata.
        # NOTE(review): nothing in this method writes ov_model.xml itself; it is
        # presumably produced by trainer.save_model() - confirm for qat=False.
        ir_path = output_path_int8 / "ov_model.xml"
        tree = ET.parse(ir_path)
        root = tree.getroot()
        meta_data = root.find("meta_data")
        if meta_data is None:
            meta_data = ET.SubElement(root, "meta_data")
        nncf_parameters = ET.SubElement(meta_data, "nncf_parameters")
        ET.SubElement(nncf_parameters, "model_name", value=self.model_name)
        # XML attribute values must be strings: store the config path, not the config object.
        ET.SubElement(nncf_parameters, "nncf_config_file", value=str(nncf_config_file))
        ET.SubElement(nncf_parameters, "quantization_type", value="qat" if qat else "ptq")
        if qat:
            dataset_string = f"{examples._info.builder_name}_{examples._info.config_name}"
            ET.SubElement(nncf_parameters, "num_epochs", value=str(num_epochs))
            ET.SubElement(nncf_parameters, "dataset_size", value=str(len(train_examples)))
            ET.SubElement(nncf_parameters, "dataset_name", value=dataset_string)
        tree.write(ir_path)

        print("*** Evaluate ***")
        metrics = trainer.evaluate()

        metrics["eval_samples"] = len(validation_examples)
        print("INT8", metrics)
        trainer.save_metrics("eval", metrics)

### Load Dataset

NNCF needs a representative dataset to quantize the model. In this example we use the [squadshifts](https://github.com/huggingface/datasets/tree/master/datasets/squadshifts) dataset, and for demonstration purposes we filter that to only use questions related to the topic _Istanbul_. 

The next cell loads the dataset, filters it, and loads a metric to evaluate the model with this dataset.

In [5]:
# Load the "new_wiki" test split of squadshifts and keep only the examples
# whose title starts with "Istanbul".
squadshifts_test = load_dataset("squadshifts", "new_wiki")["test"]
ds = squadshifts_test.filter(lambda example: example["title"].startswith("Istanbul"))

# Metric object used to score the predictions.
metric = load_metric("squad")

Reusing dataset squadshifts (/home/ubuntu/.cache/huggingface/datasets/squadshifts/new_wiki/1.0.0/8303de6ce29bd28061c984dc50d04351a73bc3c344d5efe46f38b9948c2e3aca)
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 678.14it/s]
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/squadshifts/new_wiki/1.0.0/8303de6ce29bd28061c984dc50d04351a73bc3c344d5efe46f38b9948c2e3aca/cache-4e86728829f37121.arrow


### Load Model

Create an instance of the `HuggingFaceModel` defined above, including dataset and metric. 

In [6]:
# No model_path is given, so the PyTorch model is loaded.
hf_model = HuggingFaceModel(model_name=model_name, examples=ds, metric=metric)

## Quantize

To quantize the model, we call the `.quantize()` method. We pass the path to a quantization config file and request quantization-aware training (QAT) for two epochs, using the first 150 items of the dataset as the training set.

In [7]:
output_path_int8 = Path("output") / (model_name + "_int8")
if output_path_int8.exists():
    # Move the results of an earlier run aside under a timestamped name.
    backup_path = str(output_path_int8) + str(time.time())
    os.rename(output_path_int8, backup_path)
Path("output").mkdir(exist_ok=True, parents=True)

hf_model.quantize(
    "../nncf_configs/nncf_roberta_config_squad.json",
    qat=True,
    num_epochs=2,
    dataset_train_items=150,
)

train examples: 150, validation examples: 227


Running tokenizer on train dataset #0: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 42.74ba/s]

Running tokenizer on train dataset #1: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 49.32ba/s][A


Running tokenizer on train dataset #2: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 47.88ba/s][A[A



Running tokenizer on train dataset #3: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 50.17ba/s][A[A[A




Running tokenizer on train dataset #4: 100%|████████████████████████████████████████████████████████████████████████

Statistics of the quantization algorithm:
+--------------------------------+-------+
|        Statistic's name        | Value |
| Ratio of enabled quantizations | 100   |
+--------------------------------+-------+

Statistics of the quantization share:
+----------------------------------+----------------------+
|         Statistic's name         |        Value         |
| Symmetric WQs / All placed WQs   | 100.00 % (148 / 148) |
+----------------------------------+----------------------+
| Asymmetric WQs / All placed WQs  | 0.00 % (0 / 148)     |
+----------------------------------+----------------------+
| Signed WQs / All placed WQs      | 100.00 % (148 / 148) |
+----------------------------------+----------------------+
| Unsigned WQs / All placed WQs    | 0.00 % (0 / 148)     |
+----------------------------------+----------------------+
| Per-tensor WQs / All placed WQs  | 100.00 % (148 / 148) |
+----------------------------------+----------------------+
| Per-channel WQs / All pla

Step,Training Loss


Statistics of the quantization algorithm:
+--------------------------------+-------+
|        Statistic's name        | Value |
| Ratio of enabled quantizations | 100   |
+--------------------------------+-------+

Statistics of the quantization share:
+----------------------------------+----------------------+
|         Statistic's name         |        Value         |
| Symmetric WQs / All placed WQs   | 100.00 % (148 / 148) |
+----------------------------------+----------------------+
| Asymmetric WQs / All placed WQs  | 0.00 % (0 / 148)     |
+----------------------------------+----------------------+
| Signed WQs / All placed WQs      | 100.00 % (148 / 148) |
+----------------------------------+----------------------+
| Unsigned WQs / All placed WQs    | 0.00 % (0 / 148)     |
+----------------------------------+----------------------+
| Per-tensor WQs / All placed WQs  | 100.00 % (148 / 148) |
+----------------------------------+----------------------+
| Per-channel WQs / All pla



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to output/aware-ai/roberta-large-squadv2_int8
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in output/aware-ai/roberta-large-squadv2_int8/tokenizer_config.json
Special tokens file saved in output/aware-ai/roberta-large-squadv2_int8/special_tokens_map.json


Model Optimizer arguments:
Common parameters:
	- Path to the Input Model: 	/home/ubuntu/code/nlp/nncf_demo/output/aware-ai/roberta-large-squadv2_int8/ov_model.onnx
	- Path for generated IR: 	/home/ubuntu/code/nlp/nncf_demo/output/aware-ai/roberta-large-squadv2_int8
	- IR output name: 	ov_model
	- Log level: 	ERROR
	- Batch: 	Not specified, inherited from the model
	- Input layers: 	Not specified, inherited from the model
	- Output layers: 	Not specified, inherited from the model
	- Input shapes: 	Not specified, inherited from the model
	- Source layout: 	Not specified
	- Target layout: 	Not specified
	- Layout: 	Not specified
	- Mean values: 	Not specified
	- Scale values: 	Not specified
	- Scale factor: 	Not specified
	- Precision of IR: 	FP32
	- Enable fusing: 	True
	- User transformations: 	Not specified
	- Reverse input channels: 	False
	- Enable IR generation for fixed input shape: 	False
	- Use the transformations config file: 	None
Advanced parameters:
	- Force the usage of lega

The following columns in the evaluation set  don't have a corresponding argument in `NNCFNetwork.forward` and have been ignored: offset_mapping, example_id.
***** Running Evaluation *****
  Num examples = 232
  Batch size = 1


*** Evaluate ***


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 227/227 [00:00<00:00, 494.01it/s]


INT8 {'eval_exact_match': 71.80616740088105, 'eval_f1': 84.04614867295945, 'epoch': 2.0, 'eval_samples': 227}


In [8]:
def compute_metrics_f(p):
    """Score predictions with a freshly loaded "squad" metric.

    Args:
        p: Object exposing `predictions` and `label_ids`.

    Returns:
        The result of the metric's `compute` call.
    """
    squad_metric = load_metric("squad")
    scores = squad_metric.compute(predictions=p.predictions, references=p.label_ids)
    return scores


# Load the original model and its tokenizer for the FP32 evaluation.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Every example from index 150 onwards is used for evaluation.
validation_examples = ds.select(range(150, len(ds)))
validation_dataset = validation_examples.map(
    lambda batch: prepare_validation_features(batch, tokenizer, True),
    batched=True,
    num_proc=8,
    remove_columns=validation_examples.column_names,
    load_from_cache_file=True,
    desc="Running tokenizer on validation dataset",
)

training_args = TrainingArguments(
    output_dir=f"output/{model_name}_fp32",
    do_train=True,
    local_rank=-1,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
)


def _post_process_fp32(examples, features, predictions):
    """Post-process predictions, passing the FP32 output directory along."""
    return post_processing_function(examples, features, predictions, output_dir=training_args.output_dir)


# Only `evaluate()` is called on this trainer; the validation set is also
# passed as the train dataset.
trainer_fp32 = QuestionAnsweringTrainer(
    model=model,
    args=training_args,
    train_dataset=validation_dataset,
    eval_dataset=validation_dataset,
    eval_examples=validation_examples,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    post_process_function=_post_process_fp32,
    compute_metrics=compute_metrics_f,
)

fp32_metrics = trainer_fp32.evaluate()
print(fp32_metrics)

# The sample count is added after printing and before the metrics are saved.
fp32_metrics["eval_samples"] = len(validation_examples)
trainer_fp32.save_metrics("eval", fp32_metrics)

loading configuration file https://huggingface.co/aware-ai/roberta-large-squadv2/resolve/main/config.json from cache at /home/ubuntu/.cache/huggingface/transformers/1d1ccba7034e300603775d5d45e255ed3389a5eeba26cec87c5315d533b3fb6e.8d033ace1855baf8cc69d210c6c36fd68e458b3818ab189a110fa98e54da80b5
Model config RobertaConfig {
  "_name_or_path": "aware-ai/roberta-large-squadv2",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loadi

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 227/227 [00:00<00:00, 507.74it/s]


{'eval_exact_match': 79.73568281938326, 'eval_f1': 91.36609818150431}


## Compare results

After quantization, the metric is shown. It is useful to also compare the results in more detail. The code in the following cell compares the inference results of the FP32 model with those of the INT8 model. 

### Compare Metric

Compare the evaluation metrics of the FP32 and INT8 models.

### Compare Output

In [9]:
# Load the predicted answers (example id -> answer text) written while
# evaluating the FP32 and the INT8 model.
with open(f"output/{model_name}_fp32/predictions.json", encoding="utf-8") as f:
    pt_preds = json.load(f)

with open(f"output/{model_name}_int8/predictions.json", encoding="utf-8") as f:
    int_preds = json.load(f)

# Lookup tables from example id to the original question, answers and context.
questions = {item["id"]: item["question"] for item in ds}
answers = {item["id"]: item["answers"]["text"] for item in ds}
context = {item["id"]: item["context"] for item in ds}

data = []
skipped_ids = []

# Match the two prediction sets by example id instead of by position, so a
# missing or reordered entry cannot pair the answers of different examples.
for example_id, pt_value in pt_preds.items():
    if example_id not in int_preds or example_id not in questions:
        skipped_ids.append(example_id)
        continue
    int_value = int_preds[example_id]
    if pt_value != int_value:
        data.append(
            [
                questions[example_id],
                answers[example_id],
                context[example_id],
                pt_value,
                int_value,
            ]
        )

if skipped_ids:
    print(f"Skipped {len(skipped_ids)} prediction(s) without an INT8 counterpart or dataset entry")

# Show complete rows and cell contents when the frame is displayed.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

df = pd.DataFrame(data=data, columns=["question", "answer", "context", "pytorch", "int8"])
df.to_csv(f"output/{model_name}_int8/aware_istanbul_fp32_int8.csv")

In [10]:
# Display the rows where the FP32 ("pytorch") and INT8 predictions differ.
df

Unnamed: 0,question,answer,context,pytorch,int8
0,What brought about th,"[Galleria Ataköy, the historic to the modern]","Istanbul has numerous shopping centers, from the historic to the modern. The Grand Bazaar, in operation since 1461, is among the world's oldest and largest covered markets. Mahmutpasha Bazaar is an open-air market extending between the Grand Bazaar and the Egyptian Bazaar, which has been Istanbul's major spice market since 1660. Galleria Ataköy ushered in the age of modern shopping malls in Turkey when it opened in 1987. Since then, malls have become major shopping centers outside the historic peninsula. Akmerkez was awarded the titles of ""Europe's best"" and ""World's best"" shopping mall by the International Council of Shopping Centers in 1995 and 1996; Istanbul Cevahir has been one of the continent's largest since opening in 2005; Kanyon won the Cityscape Architectural Review Award in the Commercial Built category in 2006. İstinye Park in İstinye and Zorlu Center near Levent are among the newest malls which include the stores of the world's top fashion brands. Abdi İpekçi Street in Nişantaşı and Bağdat Avenue on the Anatolian side of the city have evolved into high-end shopping districts.",Galleria Ataköy ushered in the age of modern shopping malls in Turkey when it opened in 1987.,Galleria Ataköy
1,"When the religious system was replaced with a mayor and council, what was the new system modeled after?","[French cities, French cities, French cities]","The current city structure can be traced back to the Tanzimat period of reform in the 19th century, before which Islamic judges and imams led the city under the auspices of the Grand Vizier. Following the model of French cities, this religious system was replaced by a mayor and a citywide council composed of representatives of the confessional groups (millet) across Istanbul. Beyoğlu was the first area of the city to have its own director and council, with members instead being longtime residents of the neighborhood. Laws enacted after the Ottoman constitution of 1876 aimed to expand this structure across the city, imitating the twenty arrondissements of Paris, but they were not fully implemented until 1908, when Istanbul was declared a province with nine constituent districts. This system continued beyond the founding of the Turkish Republic, with the province renamed a belediye (municipality), but the municipality was disbanded in 1957.","French cities,",French cities
2,How many distinct areas is Istanbul broken into?,"[nine, nine, nine]","The current city structure can be traced back to the Tanzimat period of reform in the 19th century, before which Islamic judges and imams led the city under the auspices of the Grand Vizier. Following the model of French cities, this religious system was replaced by a mayor and a citywide council composed of representatives of the confessional groups (millet) across Istanbul. Beyoğlu was the first area of the city to have its own director and council, with members instead being longtime residents of the neighborhood. Laws enacted after the Ottoman constitution of 1876 aimed to expand this structure across the city, imitating the twenty arrondissements of Paris, but they were not fully implemented until 1908, when Istanbul was declared a province with nine constituent districts. This system continued beyond the founding of the Turkish Republic, with the province renamed a belediye (municipality), but the municipality was disbanded in 1957.",nine,nine constituent districts
3,"Before the Tanzimat reform, who was the head of Istanbul?","[the Grand Vizier, Islamic judges and imams, Islamic judges and imams]","The current city structure can be traced back to the Tanzimat period of reform in the 19th century, before which Islamic judges and imams led the city under the auspices of the Grand Vizier. Following the model of French cities, this religious system was replaced by a mayor and a citywide council composed of representatives of the confessional groups (millet) across Istanbul. Beyoğlu was the first area of the city to have its own director and council, with members instead being longtime residents of the neighborhood. Laws enacted after the Ottoman constitution of 1876 aimed to expand this structure across the city, imitating the twenty arrondissements of Paris, but they were not fully implemented until 1908, when Istanbul was declared a province with nine constituent districts. This system continued beyond the founding of the Turkish Republic, with the province renamed a belediye (municipality), but the municipality was disbanded in 1957.",Grand Vizier.,Grand Vizier
4,The Istanbul Special Provincial Administration is similar to what?,"[the MMI, MMI, MMI]","With the Istanbul Metropolitan Municipality and Istanbul Province having equivalent jurisdictions, few responsibilities remain for the provincial government. Similar to the MMI, the Istanbul Special Provincial Administration has a governor, a democratically elected decision-making body—the Provincial Parliament—and an appointed Executive Committee. Mirroring the executive committee at the municipal level, the Provincial Executive Committee includes a secretary-general and leaders of departments that advise the Provincial Parliament. The Provincial Administration's duties are largely limited to the building and maintenance of schools, residences, government buildings, and roads, and the promotion of arts, culture, and nature conservation. Hüseyin Avni Mutlu has been Governor of Istanbul Province since May 2010.","the MMI,",the MMI
5,What are one of the duties of the Provincial Administration?,"[building and maintenance of schools, building and maintenance of schools, building and maintenance of schools]","With the Istanbul Metropolitan Municipality and Istanbul Province having equivalent jurisdictions, few responsibilities remain for the provincial government. Similar to the MMI, the Istanbul Special Provincial Administration has a governor, a democratically elected decision-making body—the Provincial Parliament—and an appointed Executive Committee. Mirroring the executive committee at the municipal level, the Provincial Executive Committee includes a secretary-general and leaders of departments that advise the Provincial Parliament. The Provincial Administration's duties are largely limited to the building and maintenance of schools, residences, government buildings, and roads, and the promotion of arts, culture, and nature conservation. Hüseyin Avni Mutlu has been Governor of Istanbul Province since May 2010.","building and maintenance of schools, residences, government buildings, and roads, and the promotion of arts, culture, and nature conservation.","the building and maintenance of schools, residences, government buildings, and roads, and the promotion of arts, culture, and nature conservation"
6,By what year did Constantinople have the world's largest city?,"[500 CE, 500 CE, 500 CE]","Throughout most of its history, Istanbul has ranked among the largest cities in the world. By 500 CE, Constantinople had somewhere between 400,000 and 500,000 people, edging out its predecessor, Rome, for world's largest city. Constantinople jostled with other major historical cities, such as Baghdad, Chang'an, Kaifeng and Merv for the position of world's most populous city until the 12th century. It never returned to being the world's largest, but remained Europe's largest city from 1500 to 1750, when it was surpassed by London.","500 CE,",500 CE
7,What was one of the cities Constantinople jostled with for world's most populous city?,"[Baghdad, Baghdad, Baghdad]","Throughout most of its history, Istanbul has ranked among the largest cities in the world. By 500 CE, Constantinople had somewhere between 400,000 and 500,000 people, edging out its predecessor, Rome, for world's largest city. Constantinople jostled with other major historical cities, such as Baghdad, Chang'an, Kaifeng and Merv for the position of world's most populous city until the 12th century. It never returned to being the world's largest, but remained Europe's largest city from 1500 to 1750, when it was surpassed by London.",Baghdad,other major historical cities
8,From what years was Istanbul Europe's largest city?,"[1500 to 1750, 1500 to 1750,, 1500 to 1750]","Throughout most of its history, Istanbul has ranked among the largest cities in the world. By 500 CE, Constantinople had somewhere between 400,000 and 500,000 people, edging out its predecessor, Rome, for world's largest city. Constantinople jostled with other major historical cities, such as Baghdad, Chang'an, Kaifeng and Merv for the position of world's most populous city until the 12th century. It never returned to being the world's largest, but remained Europe's largest city from 1500 to 1750, when it was surpassed by London.","1500 to 1750,",1500 to 1750
9,What is remarkable in Istanbul?,"[historic seafood restaurants, its historic seafood restaurants, its historic seafood restaurants]","Aside from typical Turkish cuisine like kebab, Istanbul is also famous for its historic seafood restaurants. Many of the city's most popular and upscale seafood restaurants line the shores of the Bosphorus (particularly in neighborhoods like Ortaköy, Bebek, Arnavutköy, Yeniköy, Beylerbeyi and Çengelköy). Kumkapı along the Sea of Marmara has a pedestrian zone that hosts around fifty fish restaurants. The Princes' Islands, 15 kilometers (9 mi) from the city center, are also popular for their seafood restaurants. Because of their restaurants, historic summer mansions, and tranquil, car-free streets, the Princes' Islands are a popular vacation destination among Istanbulites and foreign tourists.",historic seafood restaurants.,historic seafood restaurants


## Benchmark

Compare the inference performance (throughput and latency) of the FP32 and INT8 models on CPU.

In [11]:
# FP32 baseline benchmark.
# The model is loaded with from_pt=True and saved to `ov_fp32_path` only when
# no `ov_model.xml` is found there yet, so re-running this cell reuses the
# files already on disk.
print("FP32")
ov_fp32_path = f"output/{model_name}_fp32"
fp32_ir_xml = os.path.join(ov_fp32_path, "ov_model.xml")
if not os.path.exists(fp32_ir_xml):
    ov_model = OVAutoModelForQuestionAnswering.from_pretrained(model_name, from_pt=True)
    ov_model.save_pretrained(ov_fp32_path)

hf_fp32_model = HuggingFaceModel(model_name, ds, metric, ov_fp32_path)
# NOTE(review): arguments look like (device, duration, input shape) - confirm
# against the HuggingFaceModel definition. The returned value is the cell output.
hf_fp32_model.benchmark_app_latency_throughput("CPU", 10, "[1,128]")

FP32


loading configuration file https://huggingface.co/aware-ai/roberta-large-squadv2/resolve/main/config.json from cache at /home/ubuntu/.cache/huggingface/transformers/1d1ccba7034e300603775d5d45e255ed3389a5eeba26cec87c5315d533b3fb6e.8d033ace1855baf8cc69d210c6c36fd68e458b3818ab189a110fa98e54da80b5
Model config RobertaConfig {
  "_name_or_path": "aware-ai/roberta-large-squadv2",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_length": 384,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_

{'throughput': '27.57', 'latency': '47.56'}

In [12]:
# INT8 benchmark, using the same benchmark arguments as the FP32 cell so the
# two results can be compared directly.
# NOTE(review): this cell does not create the INT8 model directory; it assumes
# an earlier cell already wrote the quantized model there - confirm.
print("INT8")
int8_model_dir = f"output/{model_name}_int8"
hf_int8_model = HuggingFaceModel(model_name, ds, metric, int8_model_dir)
hf_int8_model.benchmark_app_latency_throughput("CPU", 10, "[1,128]")

INT8


loading configuration file https://huggingface.co/aware-ai/roberta-large-squadv2/resolve/main/config.json from cache at /home/ubuntu/.cache/huggingface/transformers/1d1ccba7034e300603775d5d45e255ed3389a5eeba26cec87c5315d533b3fb6e.8d033ace1855baf8cc69d210c6c36fd68e458b3818ab189a110fa98e54da80b5
Model config RobertaConfig {
  "_name_or_path": "aware-ai/roberta-large-squadv2",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_length": 384,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_

{'throughput': '79.32', 'latency': '19.34'}

In [13]:
# Render a small HTML/JavaScript snippet as this cell's output. It provides a
# link that toggles the visibility of every element whose data-mime-type is
# "application/vnd.jupyter.stderr" (i.e. the stderr output areas).
#
# How the script works:
# - `code_show_err` starts as false, so the first call to `code_toggle_err()`
#   hides the stderr nodes; the flag is flipped after each call, so the next
#   call shows them again, and so on.
# - `code_toggle_err` is also registered as a DOMContentLoaded listener, so it
#   runs once when that event fires.
#   NOTE(review): in a live notebook this output is probably inserted after
#   DOMContentLoaded has already fired, so the initial hide likely only takes
#   effect in an exported/static HTML page - confirm.
# - The string below is emitted verbatim; do not reformat it.
from IPython.display import HTML
HTML('''<script>
var code_show_err = false; 
var code_toggle_err = function() {
 var stderrNodes = document.querySelectorAll('[data-mime-type="application/vnd.jupyter.stderr"]')
 var stderr = Array.from(stderrNodes)
 if (code_show_err){
     stderr.forEach(ele => ele.style.display = 'block');
 } else {
     stderr.forEach(ele => ele.style.display = 'none');
 }
 code_show_err = !code_show_err
} 
document.addEventListener('DOMContentLoaded', code_toggle_err);
</script>
To toggle on/off output_stderr, click <a onclick="javascript:code_toggle_err()">here</a>.''')