# Introduction 

This tutorial demonstrates how to quantize a BERT model with both static and dynamic post-training quantization using [Intel® Neural Compressor](https://github.com/intel/neural-compressor), and how to benchmark the quantized models. For the best int8 performance, multi-instance benchmarking with 4 cores per instance is recommended.

# Prerequisite

## Install packages

* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. 

In [None]:
# install model dependency
! pip install accelerate datasets >= 1.1.3 sentencepiece != 0.1.92 protobuf torch >= 1.10 transformers >= 4.12.0 wandb

## Import packages

In [None]:
import logging
import numpy as np
import os
import random
import transformers
from dataclasses import dataclass, field
from datasets import load_dataset, load_metric
from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel, QuantizationConfig
from intel_extension_for_transformers.transformers.trainer import NLPTrainer
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from typing import Optional

# Set an empty CUDA device list and flag Weights & Biases as disabled.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "",
        "WANDB_DISABLED": "true",
    }
)

# Errors out when the installed Transformers release is older than the minimum.
# Remove at your own risk.
check_min_version("4.12.0")

# GLUE task name -> (first text column, second text column); the second entry
# is None for tasks that only have one text column.
task_to_keys = dict(
    cola=("sentence", None),
    mnli=("premise", "hypothesis"),
    mrpc=("sentence1", "sentence2"),
    qnli=("question", "sentence"),
    qqp=("question1", "question2"),
    rte=("sentence1", "sentence2"),
    sst2=("sentence", None),
    stsb=("sentence1", "sentence2"),
    wnli=("sentence1", "sentence2"),
)

logger = logging.getLogger(__name__)

## Define arguments

In [None]:
# ========== Define arguments =========
@dataclass
class DataTrainingArguments:
    """
    Describes the data that is fed to the model for training and evaluation.

    `HfArgumentParser` can turn this class into argparse arguments, so every
    field may also be specified on the command line.

    A data source must be given in one of three ways: a GLUE `task_name`, a
    `dataset_name`, or both `train_file` and `validation_file`.
    """
    task_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys)},
    )
    dataset_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the dataset to use (via the datasets library)."},
    )
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached preprocessed datasets or not."},
    )
    train_file: Optional[str] = field(
        default=None,
        metadata={"help": "A csv or a json file containing the training data."},
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "A csv or a json file containing the validation data."},
    )

    def __post_init__(self):
        """
        Lower-case `task_name` and check that a usable data source was given.

        Raises:
            ValueError: if `task_name` is not a key of `task_to_keys`, or if no
                task, no dataset name and no train/validation file pair is set.
            AssertionError: if the files are not csv/json or their extensions
                differ.
        """
        # Case 1: a GLUE task was named; it must be one we know.
        if self.task_name is not None:
            self.task_name = self.task_name.lower()
            if self.task_name not in task_to_keys:
                raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys))
            return

        # Case 2: a dataset name needs no further checks here.
        if self.dataset_name is not None:
            return

        # Case 3: local files; both of them are required.
        if self.train_file is None or self.validation_file is None:
            raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.")

        train_extension = self.train_file.rsplit(".", 1)[-1]
        assert train_extension in ("csv", "json"), "`train_file` should be a csv or a json file."
        validation_extension = self.validation_file.rsplit(".", 1)[-1]
        assert validation_extension == train_extension, (
            "`validation_file` should have the same extension (csv or json) as `train_file`."
        )


@dataclass
class ModelArguments:
    """
    Identifies the model/config/tokenizer that is loaded as the starting point.
    """
    # Declared without a default, so it has to be supplied by the caller.
    model_name_or_path: str = field(
        metadata=dict(
            help="Path to pretrained model or model identifier from huggingface.co/models",
        ),
    )


@dataclass
class OptimizationArguments:
    """
    Arguments pertaining to what type of optimization we are going to apply on the model.

    All fields have defaults, so the class can be instantiated without arguments.
    """
    tune: bool = field(
        default=False,
        metadata={"help": "Whether or not to apply quantization."},
    )
    quantization_approach: Optional[str] = field(
        default="PostTrainingStatic",
        metadata={"help": "Quantization approach. Supported approaches are PostTrainingStatic, "
                  "PostTrainingDynamic and QuantizationAwareTraining."},
    )
    is_relative: Optional[bool] = field(
        default=True,
        metadata={"help": "Metric tolerance mode, expected to be relative or absolute."},
    )
    perf_tol: Optional[float] = field(
        default=0.01,
        metadata={"help": "Performance tolerance when optimizing the model."},
    )
    benchmark: bool = field(
        default=False,
        metadata={"help": "run benchmark."},
    )
    # NOTE(review): this help text used to be a copy of `benchmark`'s; the new
    # wording is inferred from the field name -- confirm against the code that
    # consumes this flag.
    int8: bool = field(
        default=False,
        metadata={"help": "Whether to use the int8 model."},
    )


In [None]:
# Model identifier to load from the Hugging Face hub.
model_args = ModelArguments(
    model_name_or_path="textattack/bert-base-uncased-MRPC",
)
# GLUE MRPC task; the preprocessing cache is bypassed (overwrite_cache=True).
data_args = DataTrainingArguments(
    task_name="mrpc",
    max_seq_length=128,
    overwrite_cache=True,
)
# `output_dir` uses the same directory name ("saved_results_static") that the
# static-quantization cell saves to and that multi_instance.sh is pointed at.
training_args = TrainingArguments(
    output_dir="./saved_results_static",
    do_eval=True,
    do_train=True,
    no_cuda=True,
    overwrite_output_dir=True,
    per_device_train_batch_size=8,
)
optim_args = OptimizationArguments(
    tune=True,
    quantization_approach="PostTrainingStatic",
)
# Apply the log level derived from the training arguments to this notebook's logger.
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)

## Download dataset from the hub

In [None]:
# Download the GLUE dataset for the configured task.
raw_datasets = load_dataset("glue", data_args.task_name)
# Label names of the training split's "label" feature, and how many there are.
label_list = raw_datasets["train"].features["label"].names
num_labels = len(label_list)

## Download fp32 model from the hub

In [None]:
# Load the configuration, tokenizer and model for the selected checkpoint,
# all from the "main" revision.
config = AutoConfig.from_pretrained(
    model_args.model_name_or_path,
    num_labels=num_labels,
    finetuning_task=data_args.task_name,
    revision="main",
)
tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    use_fast=True,
    revision="main",
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_args.model_name_or_path,
    # A ".ckpt" in the name/path is taken to mean a TensorFlow checkpoint.
    from_tf=".ckpt" in model_args.model_name_or_path,
    config=config,
    revision="main",
)

## Preprocessing the dataset

In [None]:
# Names of the dataset columns that hold the text input(s) for this task.
sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
# Padding strategy passed to the tokenizer.
padding = False
# If the model config carries its own label order, reuse it when it matches
# the dataset's label names.
label_to_id = None
if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id:
    # Lower-case the names first: some configs store them in all caps.
    label_name_to_id = {name.lower(): idx for name, idx in model.config.label2id.items()}
    if sorted(label_name_to_id) == sorted(label_list):
        label_to_id = {
            position: int(label_name_to_id[name])
            for position, name in enumerate(label_list)
        }
    else:
        logger.warning(
            f"Your model seems to have been trained with labels, but they don't match the dataset: "
            f"model labels: {sorted(label_name_to_id)}, dataset labels: {sorted(label_list)}.\n"
            f"Ignoring the model labels as a result."
        )
if label_to_id is not None:
    model.config.label2id = label_to_id
    model.config.id2label = {idx: name for name, idx in config.label2id.items()}
# Never exceed what the tokenizer reports as its maximum length.
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

def preprocess_function(examples):
    """Tokenize a batch of examples and remap its labels when a mapping is set.

    Reads the module-level `sentence1_key`, `sentence2_key`, `tokenizer`,
    `padding`, `max_seq_length` and `label_to_id`.

    Args:
        examples: mapping of column name to the batch's values.

    Returns:
        The tokenizer output, with a "label" entry added when `label_to_id`
        is set and the batch has a "label" column.
    """
    # One text column for single-sentence tasks, two otherwise.
    if sentence2_key is None:
        texts = (examples[sentence1_key],)
    else:
        texts = (examples[sentence1_key], examples[sentence2_key])
    encoded = tokenizer(*texts, padding=padding, max_length=max_seq_length, truncation=True)

    # Map labels to IDs (not necessary for GLUE tasks); -1 is passed through as-is.
    if label_to_id is not None and "label" in examples:
        encoded["label"] = [
            -1 if label == -1 else label_to_id[label] for label in examples["label"]
        ]
    return encoded

# Run the tokenization over every split; the on-disk cache is only reused
# when `overwrite_cache` is off.
with training_args.main_process_first(desc="dataset map pre-processing"):
    raw_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        load_from_cache_file=not data_args.overwrite_cache,
    )

if training_args.do_train:
    if "train" not in raw_datasets:
        raise ValueError("--do_train requires a train dataset")
    train_dataset = raw_datasets["train"]

if training_args.do_eval:
    if not ("validation" in raw_datasets or "validation_matched" in raw_datasets):
        raise ValueError("--do_eval requires a validation dataset")
    # MNLI uses the "validation_matched" split; every other task uses "validation".
    if data_args.task_name == "mnli":
        eval_dataset = raw_datasets["validation_matched"]
    else:
        eval_dataset = raw_datasets["validation"]

# Log three randomly chosen training samples.
if training_args.do_train:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

# GLUE metric for the selected task.
metric = load_metric("glue", data_args.task_name)

# Key of the evaluation result that quantization tuning is measured against.
metric_name = "eval_accuracy"

# A custom compute_metrics function receives an `EvalPrediction` object (a namedtuple
# with a predictions and a label_ids field) and must return a dict of string to float.
def compute_metrics(p: EvalPrediction):
    """Turn raw predictions into metric values.

    Takes the argmax over axis 1 of the predictions (the first element when
    they come as a tuple). With a task name configured, the module-level
    `metric` scores them and, when it yields more than one value, their mean
    is added as "combined_score"; otherwise plain accuracy is returned.
    """
    raw = p.predictions
    if isinstance(raw, tuple):
        raw = raw[0]
    preds = np.argmax(raw, axis=1)

    if data_args.task_name is None:
        return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

    result = metric.compute(predictions=preds, references=p.label_ids)
    if len(result) > 1:
        result["combined_score"] = np.mean(list(result.values())).item()
    return result

# No explicit collator is passed to the trainers below. Per the original note, the
# data collator then defaults to DataCollatorWithPadding; it would only need changing
# if padding had already been applied during tokenization (`padding` is False above).
data_collator = None

# Quantization & Benchmark

## Static Post Training Quantization

In [None]:
# Set seed before initializing model.
set_seed(training_args.seed)
# Point the trainer's output directory at the folder this section saves to and
# benchmarks from, mirroring what the dynamic-quantization cell does for its own
# folder. This also keeps the cell correct when it is re-run after that cell.
training_args.output_dir = "./saved_results_static"
# Initialize our Trainer
trainer_static = NLPTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
# tuning
if not training_args.do_eval:
    raise ValueError("do_eval must be set to True for quantization.")

model.config.save_pretrained("./saved_results_static")
trainer_static.save_model("./saved_results_static")

# Take the tolerance from `optim_args` (relative, 0.01 by default) instead of a
# hard-coded 0.25, so the values configured in "Define arguments" are the ones
# actually used for tuning.
tune_metric = metrics.Metric(
    name=metric_name,
    is_relative=optim_args.is_relative,
    criterion=optim_args.perf_tol,
)
objective = objectives.performance
quantization_config = QuantizationConfig(
    approach="PostTrainingStatic",
    max_trials=600,
    metrics=[tune_metric],
    objectives=[objective],
)
trainer_static.quantize(quant_config=quantization_config)

## Run Benchmark after Static Post Training Quantization

In [None]:
# Evaluate with `trainer_static`, then report accuracy, per-sample latency and throughput.
results = trainer_static.evaluate()
eval_acc = results.get("eval_accuracy")
throughput = results.get("eval_samples_per_second")
print(f"Batch size = {training_args.per_device_eval_batch_size}")
print(f"Finally Eval eval_accuracy Accuracy: {eval_acc:.5f}")
# Latency in milliseconds is the inverse of samples-per-second, times 1000.
print(f"Latency: {1000 / throughput:.5f} ms")
print(f"Throughput: {throughput:.5f} samples/sec")

## Run Benchmark after Static Post Training Quantization with Multi-Instance

In [None]:
import os
# Show the machine's NUMA hardware layout.
os.system('numactl --hardware')
# Launch the multi-instance benchmark script on ./saved_results_static with 4 cores
# per instance and the int8 data type; `results` receives os.system's return value.
results = os.system('bash ../multi_instance.sh --model=saved_results_static --core_per_instance=4 --data_type=int8')

## Dynamic Post Training Quantization

In [None]:
# Set seed before initializing model.
set_seed(training_args.seed)
# Use a separate output directory for the dynamic-quantization run.
training_args.output_dir = "saved_results_dynamic"
# Initialize our Trainer
trainer_dynamic = NLPTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# tuning
if not training_args.do_eval:
    raise ValueError("do_eval must be set to True for quantization.")

model.config.save_pretrained("./saved_results_dynamic")
trainer_dynamic.save_model("./saved_results_dynamic")

# Take the tolerance from `optim_args` (relative, 0.01 by default) instead of a
# hard-coded 0.25, so the values configured in "Define arguments" are the ones
# actually used for tuning.
tune_metric = metrics.Metric(
    name=metric_name,
    is_relative=optim_args.is_relative,
    criterion=optim_args.perf_tol,
)
objective = objectives.performance
quantization_config = QuantizationConfig(
    approach="PostTrainingDynamic",
    max_trials=600,
    metrics=[tune_metric],
    objectives=[objective],
)
trainer_dynamic.quantize(quant_config=quantization_config)

## Run Benchmark after Dynamic Post Training Quantization

In [None]:
# Evaluate with `trainer_dynamic`, then report accuracy, per-sample latency and throughput.
results = trainer_dynamic.evaluate()
eval_acc = results.get("eval_accuracy")
throughput = results.get("eval_samples_per_second")
print(f"Batch size = {training_args.per_device_eval_batch_size}")
print(f"Finally Eval eval_accuracy Accuracy: {eval_acc:.5f}")
# Latency in milliseconds is the inverse of samples-per-second, times 1000.
print(f"Latency: {1000 / throughput:.5f} ms")
print(f"Throughput: {throughput:.5f} samples/sec")

## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance

In [None]:
import os
# Launch the multi-instance benchmark script on ./saved_results_dynamic with 4 cores
# per instance and the int8 data type; `results` receives os.system's return value.
results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')

## Run Benchmark for FP32 model

In [None]:
# Build a fresh trainer around `model` and evaluate it for the FP32 numbers.
set_seed(training_args.seed)
trainer_fp32 = NLPTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
results = trainer_fp32.evaluate()

eval_acc = results.get("eval_accuracy")
throughput = results.get("eval_samples_per_second")
print(f"Batch size = {training_args.per_device_eval_batch_size}")
print(f"Finally Eval eval_accuracy Accuracy: {eval_acc:.5f}")
# Latency in milliseconds is the inverse of samples-per-second, times 1000.
print(f"Latency: {1000 / throughput:.5f} ms")
print(f"Throughput: {throughput:.5f} samples/sec")

## Run Benchmark for FP32 Model with Multi-Instance

In [None]:
import os
# Benchmark the FP32 version of the model this tutorial quantizes
# (textattack/bert-base-uncased-MRPC), so the numbers are comparable with the
# int8 runs above; the previous value pointed at an unrelated model.
results = os.system('bash ../multi_instance.sh --model=textattack/bert-base-uncased-MRPC --core_per_instance=4 --data_type=fp32')