# Introduction 

This tutorial demonstrates how to apply both static and dynamic post-training quantization to a BERT model with [Intel® Neural Compressor](https://github.com/intel/neural-compressor), and how to benchmark the resulting quantized models. To get the most benefit from int8 performance, multi-instance benchmarking with 4 cores per instance is recommended.

# Prerequisite

## Install packages

* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. 

In [None]:
# install model dependency
!pip install datasets>=1.1.3 sentencepiece!=0.1.92 protobuf torch>=1.10.0 transformers>=4.12.0 wandb

## Import packages

In [None]:
import datasets
import logging
import numpy as np
import os
import sys
import torch
import transformers
from dataclasses import dataclass, field
from datasets import load_dataset
from itertools import chain
from intel_extension_for_transformers.transformers import metrics, OptimizedModel, QuantizationConfig
from intel_extension_for_transformers.transformers.trainer import NLPTrainer
from transformers import (
    AutoConfig,
    AutoModelForMultipleChoice,
    AutoTokenizer,
    HfArgumentParser,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.trainer_utils import get_last_checkpoint
from transformers.file_utils import PaddingStrategy
from typing import Optional, Union


# Module-level logger named after the current module.
logger = logging.getLogger(__name__)

# Set the WANDB_DISABLED flag for this process. Presumably this keeps the
# Weights & Biases integration from reporting runs -- confirm in the transformers docs.
os.environ["WANDB_DISABLED"] = "true"

## Define arguments

In [None]:
@dataclass
class ModelArguments:
    """Select the pretrained model, config and tokenizer used by this notebook.

    Attributes:
        model_name_or_path: Required. Either a local path to a pretrained model
            or a model identifier from huggingface.co/models.
    """

    model_name_or_path: str = field(
        metadata=dict(
            help="Path to pretrained model or model identifier from huggingface.co/models",
        ),
    )


@dataclass
class DataTrainingArguments:
    """Describe how the input data is prepared for training and evaluation.

    Attributes:
        pad_to_max_length: Pad every sample to the maximum sentence length
            instead of padding dynamically per batch. Defaults to False.
        max_eval_samples: Optional cap on the number of evaluation examples;
            None (the default) leaves the evaluation set untouched.
        overwrite_cache: Overwrite the cached training and evaluation sets.
            Defaults to False.
    """

    pad_to_max_length: bool = field(
        default=False,
        metadata=dict(
            help=(
                "Whether to pad all samples to the maximum sentence length. If False, will pad the samples "
                "dynamically when batching to the maximum length in the batch. More efficient on GPU but "
                "very bad for TPU."
            ),
        ),
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata=dict(
            help=(
                "For debugging purposes or quicker training, truncate the number of evaluation examples "
                "to this value if set."
            ),
        ),
    )
    overwrite_cache: bool = field(
        default=False,
        metadata=dict(help="Overwrite the cached training and evaluation sets"),
    )


@dataclass
class OptimizationArguments:
    """Describe which optimization is applied to the model.

    Attributes:
        tune: Whether or not to apply quantization. Defaults to False.
        quantization_approach: Name of the quantization approach.
            Defaults to "POSTTRAININGSTATIC".
    """

    tune: bool = field(
        default=False,
        metadata=dict(help="Whether or not to apply quantization."),
    )
    quantization_approach: Optional[str] = field(
        default="POSTTRAININGSTATIC",
        metadata=dict(
            help=(
                "Quantization approach. Supported approach are "
                "POSTTRAININGSTATIC, POSTTRAININGDYNAMIC and QUANTIZATIONAWARETRAINING."
            ),
        ),
    )

In [None]:
# Checkpoint that the rest of the notebook loads and quantizes.
model_args = ModelArguments(model_name_or_path="ehdwns1516/bert-base-uncased_SWAG")

# Data settings: fixed-length padding, at most 1000 evaluation examples,
# and the preprocessing cache is rebuilt.
data_args = DataTrainingArguments(
    max_eval_samples=1000,
    overwrite_cache=True,
    pad_to_max_length=True,
)

# Trainer settings: CPU only, batch size 8 for both splits, results written
# to (and allowed to overwrite) ./saved_results_static.
training_args = TrainingArguments(
    output_dir="./saved_results_static",
    overwrite_output_dir=True,
    no_cuda=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Optimization settings: quantization enabled, static post-training approach.
optim_args = OptimizationArguments(
    quantization_approach="PostTrainingStatic",
    tune=True,
)

## Download dataset from the hub

In [None]:
# Load the "regular" configuration of the "swag" dataset via datasets.load_dataset;
# later cells read its "train" and "validation" splits.
raw_datasets = load_dataset("swag", "regular")

## Download fp32 model from the hub

In [None]:
# Seed the random number generators before the model is instantiated.
set_seed(training_args.seed)

# Dataset column names consumed by the preprocessing step.
context_name = "sent1"
question_header_name = "sent2"
ending_names = [f"ending{i}" for i in range(4)]

# Load the fp32 configuration, tokenizer and multiple-choice model from the
# location named in model_args.
config = AutoConfig.from_pretrained(model_args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
model = AutoModelForMultipleChoice.from_pretrained(
    model_args.model_name_or_path,
    config=config,
    # True only when the model path contains ".ckpt".
    from_tf=".ckpt" in model_args.model_name_or_path,
    use_auth_token=None,
)

## Preprocessing the dataset

In [None]:
# Sequence length used for tokenization: the tokenizer's own maximum,
# capped at 1024.
max_seq_length = min(tokenizer.model_max_length, 1024)

# preprocessing the datasets
def preprocess_function(examples):
    """Tokenize a batch of multiple-choice examples.

    Each example contributes four (context, header + ending) sentence pairs.
    The pairs are tokenized as one flat list and then regrouped so that every
    tokenizer output holds one list of four entries per example.

    Args:
        examples: A batch of columns indexed by name; the context, question
            header and ending columns are read from it.

    Returns:
        A dict mapping each tokenizer output key to a list with one group of
        four entries per example.
    """
    group_size = 4
    contexts = examples[context_name]
    headers = examples[question_header_name]

    # Flat list of first sentences: each context repeated once per choice.
    flat_contexts = [context for context in contexts for _ in range(group_size)]

    # Flat list of second sentences: the header followed by each candidate ending,
    # in the same example-major order as the contexts above.
    flat_candidates = [
        f"{header} {examples[ending][row]}"
        for row, header in enumerate(headers)
        for ending in ending_names
    ]

    # Pad to the fixed length only when requested; otherwise leave padding off.
    padding_mode = "max_length" if data_args.pad_to_max_length else False
    encoded = tokenizer(
        flat_contexts,
        flat_candidates,
        truncation=True,
        max_length=max_seq_length,
        padding=padding_mode,
    )

    # Regroup every tokenizer output into consecutive chunks of four.
    regrouped = {}
    for key, values in encoded.items():
        regrouped[key] = [
            values[start : start + group_size] for start in range(0, len(values), group_size)
        ]
    return regrouped

# Build the tokenized training split when do_train is set.
if training_args.do_train:
    if "train" not in raw_datasets:
        raise ValueError("--do_train requires a train dataset")
    train_dataset = raw_datasets["train"]
    # main_process_first is a context manager provided by the training arguments;
    # presumably it lets the main process run the map before the others -- confirm
    # in the transformers docs.
    with training_args.main_process_first(desc="train dataset map pre-processing"):
        train_dataset = train_dataset.map(preprocess_function, batched=True)
# Build the tokenized validation split when do_eval is set.
if training_args.do_eval:
    if "validation" not in raw_datasets:
        raise ValueError("--do_eval requires a validation dataset")
    eval_dataset = raw_datasets["validation"]
    # Keep only the first max_eval_samples examples when a cap is configured.
    if data_args.max_eval_samples is not None:
        eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
    with training_args.main_process_first(desc="validation dataset map pre-processing"):
        eval_dataset = eval_dataset.map(
            preprocess_function,
            batched=True,
            # Cached results are reused only when overwrite_cache is False.
            load_from_cache_file=not data_args.overwrite_cache,
        )

# Data collator
# The transformers default collator is used as-is.
data_collator = default_data_collator

# Metric
def compute_metrics(eval_predictions):
    """Compute accuracy from a ``(predictions, label_ids)`` pair.

    The predicted class of each row is the index of its largest value along
    axis 1; accuracy is the mean of the element-wise matches with the labels.

    Args:
        eval_predictions: A pair that unpacks into the prediction scores and
            the reference label ids.

    Returns:
        A dict with the single key "accuracy" holding a Python float.
    """
    scores, references = eval_predictions
    chosen = np.argmax(scores, axis=1)
    # Matches are cast to float32 before averaging, then converted to a Python scalar.
    hits = (chosen == references).astype(np.float32)
    return {"accuracy": hits.mean().item()}

# Quantization & Benchmark

## Static Post Training Quantization

In [None]:
# Re-seed so the quantization run starts from a known random state.
set_seed(training_args.seed)
# Initialize our Trainer
trainer_static = NLPTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Tuning objective for the quantized model.
tune_metric = metrics.Metric(
    name="eval_accuracy",  # Metric used for the tuning strategy.
    is_relative=True,  # Metric tolerance mode, True is for relative, otherwise for absolute.
    # Performance tolerance when optimizing the model. Passed as a number rather
    # than a string because it is a numeric tolerance.
    criterion=0.25,
)
quantization_config = QuantizationConfig(
    approach="PostTrainingStatic",
    metrics=[tune_metric],
)

# run quantization
trainer_static.quantize(quant_config=quantization_config)

# save quantized model together with the model configuration
trainer_static.save_model("./saved_results_static")
model.config.save_pretrained("./saved_results_static")

## Run Benchmark after Static Post Training Quantization

In [None]:
# Evaluate with the static-quantization trainer and report accuracy and speed.
set_seed(training_args.seed)
results = trainer_static.evaluate()
# Both values are read with .get, so a missing key yields None.
throughput_static = results.get("eval_samples_per_second")
eval_acc_static = results.get("eval_accuracy")
print('Batch size = {}'.format(training_args.per_device_eval_batch_size))
print("Finally Eval eval_accuracy Accuracy: {}".format(eval_acc_static))
# Latency is derived from throughput: 1000 ms divided by samples per second.
print("Latency: {:.3f} ms".format(1000 / throughput_static))
print("Throughput: {} samples/sec".format(throughput_static))


## Run Benchmark after Static Post Training Quantization with Multi-Instance

In [None]:
import os
# Run `numactl --hardware` (shows the machine's NUMA hardware layout).
os.system('numactl --hardware')
# Launch the multi_instance.sh helper one directory up on the saved static int8 model,
# with 4 cores per instance. The script itself is not shown here.
# `results` is rebound to the value returned by os.system.
results = os.system('bash ../multi_instance.sh --model=saved_results_static --core_per_instance=4 --data_type=int8')

## Dynamic Post Training Quantization

In [None]:
# Re-seed so the quantization run starts from a known random state.
set_seed(training_args.seed)
# Point the shared training arguments at the dynamic-quantization output directory.
training_args.output_dir = "saved_results_dynamic"
# Initialize our Trainer
trainer_dynamic = NLPTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Tuning objective for the quantized model.
tune_metric = metrics.Metric(
    name="eval_accuracy",  # Metric used for the tuning strategy.
    is_relative=True,  # Metric tolerance mode, True is for relative, otherwise for absolute.
    # Performance tolerance when optimizing the model. Passed as a number rather
    # than a string because it is a numeric tolerance.
    criterion=0.25,
)
quantization_config = QuantizationConfig(
    approach="PostTrainingDynamic",
    metrics=[tune_metric],
)

# run quantization
trainer_dynamic.quantize(quant_config=quantization_config)

# save quantized model
trainer_dynamic.save_model("./saved_results_dynamic")

## Run Benchmark after Dynamic Post Training Quantization

In [None]:
# Evaluate with the dynamic-quantization trainer and report accuracy and speed.
set_seed(training_args.seed)
results = trainer_dynamic.evaluate()
# Both values are read with .get, so a missing key yields None.
throughput_dynamic = results.get("eval_samples_per_second")
eval_acc_dynamic = results.get("eval_accuracy")
print('Batch size = {}'.format(training_args.per_device_eval_batch_size))
print("Finally Eval eval_accuracy Accuracy: {}".format(eval_acc_dynamic))
# Latency is derived from throughput: 1000 ms divided by samples per second.
print("Latency: {:.3f} ms".format(1000 / throughput_dynamic))
print("Throughput: {} samples/sec".format(throughput_dynamic))

## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance

In [None]:
import os
# Launch the multi_instance.sh helper one directory up on the saved dynamic int8 model,
# with 4 cores per instance. The script itself is not shown here.
# `results` is rebound to the value returned by os.system.
results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')

## Run Benchmark for FP32 model

In [None]:
# Initialize the Trainer
set_seed(training_args.seed)
trainer_fp32 = NLPTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
results = trainer_fp32.evaluate()

throughput_fp32 = results.get("eval_samples_per_second")
# The value read here is the accuracy, so it is named and reported as accuracy,
# matching the static and dynamic benchmark cells.
eval_acc_fp32 = results["eval_accuracy"]
# Former (misleading) name, kept so any code still referring to it keeps working.
eval_loss_fp32 = eval_acc_fp32
print('Batch size = {}'.format(training_args.per_device_eval_batch_size))
print("Finally Eval eval_accuracy Accuracy: {}".format(eval_acc_fp32))
# Latency is derived from throughput: 1000 ms divided by samples per second.
print("Latency: {:.3f} ms".format(1000 / throughput_fp32))
print("Throughput: {} samples/sec".format(throughput_fp32))

## Run Benchmark for FP32 Model with Multi-Instance

In [None]:
import os
# Launch the multi_instance.sh helper one directory up on the fp32 hub model,
# with 4 cores per instance. The script itself is not shown here.
# `results` is rebound to the value returned by os.system.
results = os.system('bash ../multi_instance.sh --model=ehdwns1516/bert-base-uncased_SWAG --core_per_instance=4 --data_type=fp32')