# Introduction 

This tutorial demonstrates how to quantize a DistilBERT token-classification model with both static and dynamic post-training quantization using [Intel® Neural Compressor](https://github.com/intel/neural-compressor), and how to benchmark the quantized models. For the best int8 performance, multi-instance benchmarking with 4 cores per instance is recommended.

# Prerequisites

## Install packages

* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. 

In [None]:
# install model dependency
! pip install accelerate seqeval datasets >= 1.8.0 torch >= 1.10 transformers>=4.12.0 wandb

## Import packages

In [None]:
import logging
import numpy as np
import os
import sys
import transformers
from dataclasses import dataclass, field
from datasets import ClassLabel, load_dataset, load_metric
from intel_extension_for_transformers.transformers import(
    metrics,
    OptimizedModel,
    QuantizationConfig,
)
from intel_extension_for_transformers.transformers.trainer import NLPTrainer
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    HfArgumentParser,
    PretrainedConfig,
    PreTrainedTokenizerFast,
    TrainingArguments,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
from typing import Optional


# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version("4.12.0")

# Same kind of guard for `datasets`; the second argument is the hint text passed along with the requirement.
require_version("datasets>=1.8.0", "To fix: pip install -r examples/huggingface/pytorch/token-classification/quantization/requirements.txt")

# Module-level logger, used later in the notebook to warn about label mismatches.
logger = logging.getLogger(__name__)

# Sets the WANDB_DISABLED environment variable; intended to keep Weights & Biases reporting off.
os.environ["WANDB_DISABLED"] = "true"

## Define arguments

In [None]:
@dataclass
class ModelArguments:
    """Identifies the pretrained checkpoint (model/config/tokenizer) the tutorial starts from."""

    # Required field (no default): a local path or a model id on huggingface.co/models.
    model_name_or_path: str = field(
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models",
        },
    )

@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    `task_name` is normalized to lower case after initialization; a value of
    `None` is left untouched.
    """

    # Task identifier; it is also used to build the label column name ("<task>_tags").
    task_name: Optional[str] = field(default="ner", metadata={"help": "The name of the task (ner, pos...)."})
    # Name of the dataset on the Hub.
    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    # When True, cached preprocessing results are not reused.
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    # Fixed-length padding versus dynamic per-batch padding.
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": "Whether to pad all samples to model maximum sentence length. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
            "efficient on GPU but very bad for TPU."
        },
    )
    # Optional cap on the number of training examples.
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    # Optional cap on the number of evaluation examples.
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )

    def __post_init__(self):
        """Lower-case `task_name` so that task names are handled case-insensitively."""
        # `task_name` is declared Optional, so guard against None instead of
        # failing with AttributeError on `.lower()`.
        if self.task_name is not None:
            self.task_name = self.task_name.lower()

@dataclass
class OptimizationArguments:
    """Switches describing which optimization is applied to the model."""

    # Off by default; set to True to request quantization.
    tune: bool = field(default=False, metadata={"help": "Whether or not to apply quantization."})


In [None]:
# Checkpoint to quantize: a DistilBERT token-classification model taken from the Hugging Face Hub.
model_args = ModelArguments(
    model_name_or_path="elastic/distilbert-base-uncased-finetuned-conll03-english",
)
# Dataset options: do not reuse cached preprocessing results and cap evaluation at 1000 examples.
data_args = DataTrainingArguments(
    dataset_name="conll2003",
    overwrite_cache=True,
    max_eval_samples=1000,
    pad_to_max_length=True
)
# Trainer options: `do_train` / `do_eval` decide whether the train / eval splits are handed to the
# trainers built later in the notebook; `no_cuda=True` is intended to keep the runs on CPU.
training_args = TrainingArguments(
    output_dir="./saved_results_static",
    do_eval=True,
    do_train=True,
    no_cuda=True,
    overwrite_output_dir=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)
# NOTE(review): `optim_args.tune` is not read anywhere else in the visible notebook — confirm it is still needed.
optim_args = OptimizationArguments(
    tune=True
)

## Download dataset from the hub

In [None]:
# Download and load the dataset from the Hub. The name comes from `data_args`
# so the arguments cell is the single place that selects the dataset.
raw_datasets = load_dataset(data_args.dataset_name)

# Column names and feature schema of the training split.
column_names = raw_datasets["train"].column_names
features = raw_datasets["train"].features
# Words are read from the "tokens" column and tags from "<task_name>_tags".
text_column_name = "tokens"
label_column_name = f"{data_args.task_name}_tags"

# In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
# unique labels.
def get_label_list(labels):
    """Return the distinct labels found across all label sequences, sorted ascending.

    Args:
        labels: Iterable of label sequences (one sequence per example).

    Returns:
        A sorted list containing each label exactly once.
    """
    distinct = {tag for sequence in labels for tag in sequence}
    return sorted(distinct)

# A `ClassLabel` feature means the dataset already stores integer ids together with their
# names; otherwise the label vocabulary has to be collected from the training split.
labels_are_int = isinstance(features[label_column_name].feature, ClassLabel)
if not labels_are_int:
    label_list = get_label_list(raw_datasets["train"][label_column_name])
    label_to_id = {name: position for position, name in enumerate(label_list)}
else:
    label_list = features[label_column_name].feature.names
    label_to_id = {class_id: class_id for class_id in range(len(label_list))}

num_labels = len(label_list)

## Download fp32 model from the hub

In [None]:
# Load the pretrained FP32 configuration, tokenizer and model from the Hub.
config = AutoConfig.from_pretrained(
    model_args.model_name_or_path,
    num_labels=num_labels,
    # Keep the task in sync with the one selected in `data_args`.
    finetuning_task=data_args.task_name,
    revision="main",
)

tokenizer_name_or_path = model_args.model_name_or_path
# Shared tokenizer options; gpt2/roberta checkpoints additionally get
# `add_prefix_space=True`.
tokenizer_kwargs = {"use_fast": True, "revision": "main"}
if config.model_type in {"gpt2", "roberta"}:
    tokenizer_kwargs["add_prefix_space"] = True
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, **tokenizer_kwargs)

# This is the FP32 starting point; quantization happens later in the notebook.
model = AutoModelForTokenClassification.from_pretrained(
    model_args.model_name_or_path,
    # TensorFlow checkpoints are recognized by ".ckpt" in the path.
    from_tf=".ckpt" in model_args.model_name_or_path,
    config=config,
    revision="main",
)

# The checkpoint carries its own label mapping (i.e. it differs from the default
# mapping for `num_labels` labels) -> reuse it when it matches the dataset labels.
if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id:
    if sorted(model.config.label2id.keys()) == sorted(label_list):
        # Reorganize `label_list` to match the ordering of the model.
        if labels_are_int:
            # Dataset ids are positions in the dataset's label list; map each one
            # to the id the model uses for the same label name.
            label_to_id = {i: int(model.config.label2id[l]) for i, l in enumerate(label_list)}
            label_list = [model.config.id2label[i] for i in range(num_labels)]
        else:
            label_list = [model.config.id2label[i] for i in range(num_labels)]
            label_to_id = {l: i for i, l in enumerate(label_list)}
    else:
        # The message is a single format string with lazy %-args. Passing a
        # second string as an argument to a message without placeholders makes
        # the logging module report a formatting error instead of the warning.
        logger.warning(
            "Your model seems to have been trained with labels, but they don't match the dataset: "
            "model labels: %s, dataset labels: %s."
            "\nIgnoring the model labels as a result.",
            sorted(model.config.label2id.keys()),
            sorted(label_list),
        )

# Set the correspondences label/ID inside the model config
model.config.label2id = {l: i for i, l in enumerate(label_list)}
model.config.id2label = {i: l for i, l in enumerate(label_list)}

# For every label position: the position of the matching "I-" label when the label
# starts with "B-" and that counterpart exists in `label_list`, otherwise the
# position itself.
b_to_i_label = [
    label_list.index(tag.replace("B-", "I-"))
    if tag.startswith("B-") and tag.replace("B-", "I-") in label_list
    else position
    for position, tag in enumerate(label_list)
]

## Preprocessing the dataset

In [None]:
# Padding strategy: pad every example to the model maximum length when
# `data_args.pad_to_max_length` is set; otherwise leave padding to batching
# time, as described in that argument's help text.
padding = "max_length" if data_args.pad_to_max_length else False


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align the word labels to the tokens.

    Args:
        examples: A batch of examples; the words are read from
            `examples[text_column_name]` and the per-word labels from
            `examples[label_column_name]`.

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per example. The first token of each word carries the word's
        label id (looked up in `label_to_id`); every other position is -100.
    """
    tokenized_inputs = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples[label_column_name]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special tokens have a word id that is None, and the remaining
                # tokens of a word repeat the previous word id. Both get -100 so
                # they are automatically ignored in the loss function.
                label_ids.append(-100)
            else:
                # First token of a word: use the word's label.
                label_ids.append(label_to_id[label[word_idx]])
            previous_word_idx = word_idx

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Training split, optionally truncated to `max_train_samples`, then tokenized.
train_dataset = (
    raw_datasets["train"]
    if data_args.max_train_samples is None
    else raw_datasets["train"].select(range(data_args.max_train_samples))
)
with training_args.main_process_first(desc="train dataset map pre-processing"):
    train_dataset = train_dataset.map(
        tokenize_and_align_labels,
        batched=True,
        desc="Running tokenizer on train dataset",
        load_from_cache_file=not data_args.overwrite_cache,
    )

# Validation split, optionally truncated to `max_eval_samples`, then tokenized.
eval_dataset = (
    raw_datasets["validation"]
    if data_args.max_eval_samples is None
    else raw_datasets["validation"].select(range(data_args.max_eval_samples))
)
with training_args.main_process_first(desc="validation dataset map pre-processing"):
    eval_dataset = eval_dataset.map(
        tokenize_and_align_labels,
        batched=True,
        desc="Running tokenizer on validation dataset",
        load_from_cache_file=not data_args.overwrite_cache,
    )

# Data collator built from the tokenizer; it is handed to every trainer created below.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Metrics: the seqeval metric is used by `compute_metrics`; `metric_name` is the key read from
# the evaluation results and the name given to the tuning metric in the quantization cells.
metric = load_metric("seqeval")
metric_name = "eval_f1"

def compute_metrics(p):
    """Compute overall precision, recall, F1 and accuracy for a set of predictions.

    Args:
        p: A pair `(predictions, labels)`. The arg-max over axis 2 of
            `predictions` is taken as the predicted label id; positions whose
            label is -100 are ignored.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the "overall_*" entries of the metric result.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold):
        # Keep only the positions that carry a real label (drop the ignored index).
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    reported = ("precision", "recall", "f1", "accuracy")
    return {key: results["overall_" + key] for key in reported}


# Quantization & Benchmark

## Static Post Training Quantization

In [None]:
# Set seed before initializing model.
set_seed(training_args.seed)
# Initialize our Trainer. The train / eval splits are attached only when the matching
# `do_train` / `do_eval` flag is set in `training_args`.
trainer_static = NLPTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# tuning
# Save the model config and the model into the static-results directory before quantizing.
model.config.save_pretrained("./saved_results_static")
trainer_static.save_model("./saved_results_static")
# Tuning metric: `metric_name` ("eval_f1") with a relative criterion of 0.25.
# NOTE(review): presumably this tolerates a relative drop of up to 25% versus the FP32 baseline — confirm
# against the `metrics.Metric` documentation.
tune_metric = metrics.Metric(
    name=metric_name, is_relative=True, criterion=0.25
)
# Post-training static quantization, evaluated with the metric above.
quantization_config = QuantizationConfig(
    approach="PostTrainingStatic",
    metrics=[tune_metric],
)
# NOTE(review): presumably the quantized model is written under `training_args.output_dir` — confirm.
trainer_static.quantize(quantization_config)

## Run Benchmark after Static Post Training Quantization

In [None]:
# Evaluate with the trainer that was just used for static quantization and report the results.
results_static = trainer_static.evaluate()
# NOTE(review): `.get` yields None when the key is absent, which would make the latency division
# below fail — confirm "eval_samples_per_second" is always reported.
throughput_static = results_static.get("eval_samples_per_second")
print('Batch size = {}'.format(training_args.per_device_eval_batch_size))
# The value printed here is the `metric_name` entry of the evaluation results.
print("Finally Eval {} Accuracy: {}".format(metric_name, results_static[metric_name]))
# Latency is derived from throughput: 1000 / (samples per second) = milliseconds per sample.
print("Latency: {:.3f} ms".format(1000 / throughput_static))
print("Throughput: {} samples/sec".format(throughput_static))

## Run Benchmark after Static Post Training Quantization with Multi-Instance

In [None]:
# Redundant re-import: `os` is already imported at the top of the notebook.
import os
# Show the NUMA hardware layout reported by `numactl`.
os.system('numactl --hardware')
# Launch the multi-instance benchmark script on the static-quantization output directory
# (4 cores per instance, int8). `os.system` returns the command's exit status, kept in `results`.
results = os.system('bash ../multi_instance.sh --model=saved_results_static --core_per_instance=4 --data_type=int8')

## Dynamic Post Training Quantization

In [None]:
# Set seed before initializing model.
set_seed(training_args.seed)
# Switch the output directory for the dynamic run. `training_args` is the same object that was
# passed to `trainer_static`, so this assignment is visible through that reference as well.
training_args.output_dir = "saved_results_dynamic"
# Initialize our Trainer
# NOTE(review): `model` was already given to `trainer_static`, which has been quantized above —
# confirm it is still the untouched FP32 model at this point.
trainer_dynamic = NLPTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# tuning
# Save the model config and the model into the dynamic-results directory before quantizing.
model.config.save_pretrained("./saved_results_dynamic")
trainer_dynamic.save_model("./saved_results_dynamic")
# Same tuning metric as in the static run: `metric_name` with a relative criterion of 0.25.
tune_metric = metrics.Metric(
    name=metric_name, is_relative=True, criterion=0.25
)
# Post-training dynamic quantization, evaluated with the metric above.
quantization_config = QuantizationConfig(
    approach="PostTrainingDynamic",
    metrics=[tune_metric],
)
trainer_dynamic.quantize(quantization_config)

## Run Benchmark after Dynamic Post Training Quantization

In [None]:
# Evaluate with the trainer that was just used for dynamic quantization and report the results.
results_dynamic = trainer_dynamic.evaluate()
# NOTE(review): `.get` yields None when the key is absent, which would make the latency division
# below fail — confirm "eval_samples_per_second" is always reported.
throughput_dynamic = results_dynamic.get("eval_samples_per_second")
print('Batch size = {}'.format(training_args.per_device_eval_batch_size))
# The value printed here is the `metric_name` entry of the evaluation results.
print("Finally Eval {} Accuracy: {}".format(metric_name, results_dynamic[metric_name]))
# Latency is derived from throughput: 1000 / (samples per second) = milliseconds per sample.
print("Latency: {:.3f} ms".format(1000 / throughput_dynamic))
print("Throughput: {} samples/sec".format(throughput_dynamic))

## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance

In [None]:
# Redundant re-import: `os` is already imported at the top of the notebook.
import os
# Launch the multi-instance benchmark script on the dynamic-quantization output directory
# (4 cores per instance, int8). `os.system` returns the command's exit status, kept in `results`.
results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')

## Run Benchmark for FP32 model

In [None]:
# Set seed before initializing model.
set_seed(training_args.seed)
# Initialize our Trainer
# NOTE(review): this reuses the same `model` object that was passed to both quantization trainers
# above — confirm it is still the original FP32 model, otherwise this baseline is not FP32.
trainer_fp32 = NLPTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Evaluate without calling `quantize` on this trainer and report the same figures as for the
# quantized runs.
results_fp32 = trainer_fp32.evaluate()
# NOTE(review): `.get` yields None when the key is absent, which would make the latency division
# below fail — confirm "eval_samples_per_second" is always reported.
throughput_fp32 = results_fp32.get("eval_samples_per_second")
print('Batch size = {}'.format(training_args.per_device_eval_batch_size))
# The value printed here is the `metric_name` entry of the evaluation results.
print("Finally Eval {} Accuracy: {}".format(metric_name, results_fp32[metric_name]))
# Latency is derived from throughput: 1000 / (samples per second) = milliseconds per sample.
print("Latency: {:.3f} ms".format(1000 / throughput_fp32))
print("Throughput: {} samples/sec".format(throughput_fp32))

## Run Benchmark for FP32 Model with Multi-Instance

In [None]:
# Redundant re-import: `os` is already imported at the top of the notebook.
import os
# Launch the multi-instance benchmark script on the original Hub checkpoint
# (4 cores per instance, fp32). `os.system` returns the command's exit status, kept in `results`.
results = os.system('bash ../multi_instance.sh --model=elastic/distilbert-base-uncased-finetuned-conll03-english --core_per_instance=4 --data_type=fp32')