# Introduction 

This tutorial demonstrates how to quantize a BERT model with both static and dynamic post-training quantization using [Intel® Neural Compressor](https://github.com/intel/neural-compressor), and how to benchmark the quantized models. To get the most benefit from int8 performance, multi-instance benchmarking with 4 cores per instance is recommended.

# Prerequisites

## Install packages

* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. 

In [None]:
# install model dependency
!pip install accelerate datasets>=1.8.0 sentencepiece!=0.1.92 torch>=1.10.0 wandb

## Import packages

In [None]:
import datasets
import logging
import os
import sys
import transformers
from dataclasses import dataclass, field
from datasets import load_dataset, load_metric
from itertools import chain
from intel_extension_for_transformers.transformers import metrics, OptimizedModel, QuantizationConfig
from intel_extension_for_transformers.transformers.trainer import NLPTrainer
from transformers import (
    CONFIG_MAPPING,
    MODEL_FOR_MASKED_LM_MAPPING,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    is_torch_tpu_available,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils.versions import require_version
from typing import Optional

# Set the WANDB_DISABLED flag in the process environment before any trainer is created.
# NOTE(review): presumably this turns off Weights & Biases reporting in the trainer
# integration — confirm against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

# Module-level logger named after the current module.
logger = logging.getLogger(__name__)
# Config classes registered for masked-LM models, and their `model_type` strings.
# Neither constant is referenced again in the visible part of this notebook.
MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

## Define arguments

In [None]:
@dataclass
class ModelArguments:
    """
    Arguments selecting the model checkpoint whose config, tokenizer and weights are loaded.
    """

    # Checkpoint name or path handed to the `from_pretrained` loaders; defaults to None.
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            # The trailing space matters: adjacent string literals are concatenated
            # verbatim, so without it the two sentences run together.
            "help": "The model checkpoint for weights initialization. "
            "Don't set if you want to train a model from scratch."
        },
    )


@dataclass
class DataTrainingArguments:
    """
    Dataset selection and masking settings used when preparing data for the model.
    """

    # Dataset identifier passed to the datasets library.
    dataset_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The name of the dataset to use (via the datasets library).",
        },
    )

    # Configuration (subset) name of that dataset.
    dataset_config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The configuration name of the dataset to use (via the datasets library).",
        },
    )

    # Fraction of tokens masked for the masked-language-modeling loss.
    mlm_probability: float = field(
        default=0.15,
        metadata={
            "help": "Ratio of tokens to mask for masked language modeling loss",
        },
    )
                    

In [None]:
# Model checkpoint used for the config, tokenizer and weights loaded later.
model_args = ModelArguments(
    model_name_or_path="bert-base-uncased", 
)
# Dataset name and configuration passed to `load_dataset`.
data_args = DataTrainingArguments(
    dataset_name="wikitext",
    dataset_config_name="wikitext-2-raw-v1",
)
# Trainer settings shared by every trainer built in this notebook.
# `output_dir` is reassigned before the dynamic quantization run.
training_args = TrainingArguments(
    output_dir="./saved_results_static",
    do_eval=True,
    do_train=True,
    no_cuda=True,
    per_device_eval_batch_size=1,
    overwrite_output_dir=True
)

## Download dataset from the hub

In [None]:
# Load the dataset selected in `data_args`. Later cells index the result by
# split name ("train", "validation").
raw_datasets = load_dataset(
    data_args.dataset_name, data_args.dataset_config_name
)

## Download fp32 model from the hub

In [None]:
# Set seed before initializing model.
set_seed(training_args.seed)

# get fp32 model
# Config, tokenizer and masked-LM weights are all loaded from the same checkpoint name.
config = AutoConfig.from_pretrained(model_args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
model = AutoModelForMaskedLM.from_pretrained(
    model_args.model_name_or_path,
    config=config
)
# Resize the token embeddings to the tokenizer's length.
model.resize_token_embeddings(len(tokenizer))

## Preprocessing the dataset

In [None]:
# Tokenization setup: take the column names from the split that matches the run mode.
column_names = raw_datasets["train" if training_args.do_train else "validation"].column_names
# Prefer a column literally named "text"; otherwise fall back to the first column.
if "text" in column_names:
    text_column_name = "text"
else:
    text_column_name = column_names[0]

# Chunk length used when the tokenized texts are grouped.
max_seq_length = tokenizer.model_max_length


def tokenize_function(examples):
    """Tokenize the text column of a batch, also returning the special-tokens mask."""
    texts = examples[text_column_name]
    return tokenizer(texts, return_special_tokens_mask=True)

# `column_names` and `text_column_name` were already chosen above according to
# `training_args.do_train`. They are reused here instead of being recomputed from
# the "train" split, which discarded that choice and raised a KeyError for
# datasets without a "train" split.
with training_args.main_process_first(desc="dataset map tokenization"):
    # Tokenize in batches and drop the raw columns so only tokenizer outputs remain.
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=column_names,
        load_from_cache_file=True,
        desc="Running tokenizer on every text in dataset",
    )

# Concatenates every tokenized text in a batch and re-splits it into chunks of max_seq_length.
def group_texts(examples):
    """Flatten each column of ``examples`` and cut it into ``max_seq_length``-sized pieces.

    ``examples`` maps column names to lists of token lists. The usable length is
    taken from the first column. When it is at least ``max_seq_length`` the
    trailing remainder is dropped; a shorter batch is kept as one short chunk.
    """
    # Flatten every column into one long token list.
    flattened = {}
    for key in examples.keys():
        flattened[key] = list(chain(*examples[key]))

    first_key = list(examples.keys())[0]
    usable = len(flattened[first_key])
    # Trim to a whole number of chunks, but only when at least one full chunk exists.
    if usable >= max_seq_length:
        usable -= usable % max_seq_length

    # Start offset of every chunk, shared by all columns.
    starts = range(0, usable, max_seq_length)
    chunked = {}
    for key, tokens in flattened.items():
        chunked[key] = [tokens[begin : begin + max_seq_length] for begin in starts]
    return chunked

# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
# remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
# might be slower to preprocess.

# Re-chunk the tokenized data with group_texts. `load_from_cache_file=True` is
# passed so cached results can be reused.
with training_args.main_process_first(desc="grouping texts together"):
    tokenized_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        load_from_cache_file=True,
        desc=f"Grouping texts in chunks of {max_seq_length}",
    )

# Fail early when a requested split is missing, then bind the split.
if training_args.do_train and "train" not in tokenized_datasets:
    raise ValueError("--do_train requires a train dataset")
if training_args.do_train:
    train_dataset = tokenized_datasets["train"]

if training_args.do_eval and "validation" not in tokenized_datasets:
    raise ValueError("--do_eval requires a validation dataset")
if training_args.do_eval:
    eval_dataset = tokenized_datasets["validation"]

    def preprocess_logits_for_metrics(logits, labels):
        """Reduce model outputs to predicted token ids via argmax over the last axis."""
        # When the output is a tuple, the logits are its first element.
        scores = logits[0] if isinstance(logits, tuple) else logits
        return scores.argmax(dim=-1)

    metric = load_metric("accuracy")

    def compute_metrics(eval_preds):
        """Compute the accuracy metric over positions whose label is not -100."""
        predictions, references = eval_preds
        # Predictions are already token ids (see preprocess_logits_for_metrics),
        # so both arrays are flattened the same way.
        flat_refs = references.reshape(-1)
        flat_preds = predictions.reshape(-1)
        keep = flat_refs != -100
        return metric.compute(predictions=flat_preds[keep], references=flat_refs[keep])


# Data collator will take care of randomly masking the tokens.
# The masking ratio comes from `data_args.mlm_probability`; no padding multiple is requested.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=data_args.mlm_probability,
    pad_to_multiple_of=None,
)

# Quantization & Benchmark

## Static Post Training Quantization

In [None]:
# Initialize the Trainer used for static post-training quantization.
set_seed(training_args.seed)
trainer_ptq_static = NLPTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # The metric hooks are only attached when evaluating and not running on TPU.
    compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
    if training_args.do_eval and not is_torch_tpu_available()
    else None,
)

tune_metric = metrics.Metric(
    name="eval_loss",  # Metric used for the tuning strategy.
    is_relative=True,  # Metric tolerance mode, True is for relative, otherwise for absolute.
    criterion=0.25,  # Performance tolerance when optimizing the model; passed as a number, not a string.
    greater_is_better=False,  # A lower eval_loss is the better result.
)
quantization_config = QuantizationConfig(
    approach="PostTrainingStatic",
    metrics=[tune_metric],
)

# run quantization
trainer_ptq_static.quantize(quant_config=quantization_config)

# save quantized model together with the model config in the same directory
trainer_ptq_static.save_model("./saved_results_static")
model.config.save_pretrained("./saved_results_static")

## Run Benchmark after Static Post Training Quantization

In [None]:
# Evaluate with the trainer used for static quantization and report loss, latency and throughput.
set_seed(training_args.seed)
results = trainer_ptq_static.evaluate()
# NOTE(review): this list is not referenced anywhere else in the visible notebook.
bert_task_acc_keys = ['eval_loss', 'eval_f1', 'eval_accuracy', 'eval_matthews_correlation',
                              'eval_pearson', 'eval_mcc', 'eval_spearmanr']

# `.get` yields None when the key is absent, in which case the latency division below raises.
throughput_ptq_static = results.get("eval_samples_per_second")
eval_loss_ptq_static = results["eval_loss"]
print('Batch size = {}'.format(training_args.per_device_eval_batch_size))
# The value printed here is the evaluation loss, despite the "Accuracy" label.
print("Finally Eval eval_loss Accuracy: {}".format(eval_loss_ptq_static))
# Latency in ms per sample is derived from throughput in samples/sec.
print("Latency: {:.3f} ms".format(1000 / throughput_ptq_static))
print("Throughput: {} samples/sec".format(throughput_ptq_static))

## Run Benchmark after Static Post Training Quantization with Multi-Instance

In [None]:
import os
# Print the NUMA hardware layout, then run the multi-instance benchmark script on the
# saved static model with 4 cores per instance and the int8 data type.
os.system('numactl --hardware')
# `os.system` returns the command's exit status, so `results` is rebound to that
# status here and no longer holds the evaluation dict.
results = os.system('bash ../multi_instance.sh --model=saved_results_static --core_per_instance=4 --data_type=int8')

## Dynamic Post Training Quantization

In [None]:
# Initialize the Trainer used for dynamic post-training quantization.
set_seed(training_args.seed)
# Point the shared training arguments at the dynamic output directory.
training_args.output_dir = "./saved_results_dynamic"
trainer_ptq_dynamic = NLPTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # The metric hooks are only attached when evaluating and not running on TPU.
    compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
    if training_args.do_eval and not is_torch_tpu_available()
    else None,
)

tune_metric = metrics.Metric(
    name="eval_loss",  # Metric used for the tuning strategy.
    is_relative=True,  # Metric tolerance mode, True is for relative, otherwise for absolute.
    criterion=0.25,  # Performance tolerance when optimizing the model; passed as a number, not a string.
    greater_is_better=False,  # A lower eval_loss is the better result.
)
quantization_config = QuantizationConfig(
    approach="PostTrainingDynamic",
    metrics=[tune_metric],
)

# run quantization
trainer_ptq_dynamic.quantize(quant_config=quantization_config)

# save quantized model together with the model config, as the static cell does
trainer_ptq_dynamic.save_model("./saved_results_dynamic")
model.config.save_pretrained("./saved_results_dynamic")

## Run Benchmark after Dynamic Post Training Quantization

In [None]:
# Evaluate with the trainer used for dynamic quantization and report loss, latency and throughput.
set_seed(training_args.seed)
results = trainer_ptq_dynamic.evaluate()

# `.get` yields None when the key is absent, in which case the latency division below raises.
throughput_ptq_dynamic = results.get("eval_samples_per_second")
eval_loss_ptq_dynamic = results["eval_loss"]
print('Batch size = {}'.format(training_args.per_device_eval_batch_size))
# The value printed here is the evaluation loss, despite the "Accuracy" label.
print("Finally Eval eval_loss Accuracy: {}".format(eval_loss_ptq_dynamic))
# Latency in ms per sample is derived from throughput in samples/sec.
print("Latency: {:.3f} ms".format(1000 / throughput_ptq_dynamic))
print("Throughput: {} samples/sec".format(throughput_ptq_dynamic))

## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance

In [None]:
import os
# Run the multi-instance benchmark script on the saved dynamic model with 4 cores per
# instance and the int8 data type. `os.system` returns the exit status, so `results`
# is rebound to that status rather than an evaluation dict.
results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')

## Run Benchmark for FP32 model

In [None]:
# Initialize the Trainer
# NOTE(review): `model` is the same object that was handed to both quantization
# trainers above — confirm that `quantize()` leaves it unmodified so this cell
# really measures the FP32 baseline.
# `training_args.output_dir` still points at "./saved_results_dynamic" here.
set_seed(training_args.seed)
trainer = NLPTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
    if training_args.do_eval and not is_torch_tpu_available()
    else None,
)
results = trainer.evaluate()

# `.get` yields None when the key is absent, in which case the latency division below raises.
throughput_fp32 = results.get("eval_samples_per_second")
eval_loss_fp32 = results["eval_loss"]
print('Batch size = {}'.format(training_args.per_device_eval_batch_size))
# The value printed here is the evaluation loss, despite the "Accuracy" label.
print("Finally Eval eval_loss Accuracy: {}".format(eval_loss_fp32))
# Latency in ms per sample is derived from throughput in samples/sec.
print("Latency: {:.3f} ms".format(1000 / throughput_fp32))
print("Throughput: {} samples/sec".format(throughput_fp32))

## Run Benchmark for FP32 Model with Multi-Instance

In [None]:
import os
# Run the multi-instance benchmark script on the hub checkpoint name with 4 cores per
# instance and the fp32 data type. `os.system` returns the exit status, so `results`
# is rebound to that status rather than an evaluation dict.
results = os.system('bash ../multi_instance.sh --model=bert-base-uncased --core_per_instance=4 --data_type=fp32')