# Introduction 

This tutorial demonstrates how to quantize a T5 model with dynamic post-training quantization using [Intel® Neural Compressor](https://github.com/intel/neural-compressor) and how to benchmark the quantized model. For the best INT8 performance, multi-instance benchmarking with 4 cores per instance is recommended.

# Prerequisite

## Install packages

* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. 

In [None]:
# install model dependency
!pip install accelerate datasets >= 1.8 sentencepiece != 0.1.92 protobuf sacrebleu >= 1.4.12 py7zr torch >= 1.10 transformers>=4.19.0.dev0

## Import packages

In [None]:
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
import numpy as np
from datasets import load_dataset, load_metric

from intel_extension_for_transformers.transformers import OptimizedModel, QuantizationConfig
from intel_extension_for_transformers.transformers import metrics as nlp_metrics
from intel_extension_for_transformers.transformers.trainer import NLPSeq2SeqTrainer
import transformers
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    M2M100Tokenizer,
    MBart50Tokenizer,
    MBart50TokenizerFast,
    MBartTokenizer,
    MBartTokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version


# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version("4.19.0.dev0")

# Require datasets>=1.8.0; the second argument is the hint shown to the user when the check fails.
require_version("datasets>=1.8.0", "To fix: pip install -r examples/huggingface/pytorch/translation/quantization/requirements.txt")

# Logger named after the current module.
logger = logging.getLogger(__name__)

# A list of all multilingual tokenizers which require src_lang and tgt_lang attributes.
# NOTE(review): not referenced by any of the cells visible here -- confirm whether it is still needed.
MULTILINGUAL_TOKENIZERS = [MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast, M2M100Tokenizer]


## Define arguments

In [None]:
@dataclass
class ModelArguments:
    """
    Arguments selecting the model/config/tokenizer checkpoint to start from.
    """

    # Required: the field declares no default, so it must be supplied on construction.
    model_name_or_path: str = field(
        metadata=dict(help="Path to pretrained model or model identifier from huggingface.co/models"),
    )


@dataclass
class DataTrainingArguments:
    """
    Arguments describing the data fed to the model for training and evaluation.

    Every field is optional at construction time and defaults to ``None``
    (``False`` for ``overwrite_cache``).
    """

    # Language ids default to None, so they are annotated Optional[str] rather than str.
    source_lang: Optional[str] = field(default=None, metadata={"help": "Source language id for translation."})
    target_lang: Optional[str] = field(default=None, metadata={"help": "Target language id for translation."})

    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    source_prefix: Optional[str] = field(
        default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
    )

@dataclass
class OptimizationArguments:
    """
    Arguments describing which optimization is applied to the model.
    """

    # Quantization switch; off unless explicitly enabled.
    tune: bool = field(default=False, metadata=dict(help="Whether or not to apply quantization."))


In [None]:
# Checkpoint identifier for the model under test.
model_args = ModelArguments(
    model_name_or_path="t5-small",
)
# English -> Romanian translation on WMT16; max_eval_samples caps the
# validation split at 400 examples when the eval dataset is built.
data_args = DataTrainingArguments(
    source_lang="en",
    target_lang="ro",
    dataset_name="wmt16",
    dataset_config_name="ro-en",
    overwrite_cache=True,
    max_eval_samples=400,
    source_prefix="translate English to Romanian: "
)
# Trainer settings: CUDA disabled (no_cuda=True), eval batch size 8, and
# generation-based prediction enabled. output_dir is where the model and
# config are saved by the quantization cell.
training_args = Seq2SeqTrainingArguments(
    output_dir="./saved_results_dynamic",
    do_eval=True,
    do_train=True,
    no_cuda=True,
    overwrite_output_dir=True,
    per_device_eval_batch_size=8,
    predict_with_generate=True
)
# Quantization is requested.
# NOTE(review): optim_args is not read by any cell visible here -- confirm it is needed.
optim_args = OptimizationArguments(
    tune=True,
)

## Download dataset from the hub

In [None]:
# Load the dataset selected in `data_args` (wmt16 / ro-en with the values set
# above) instead of repeating the names as literals, so changing `data_args`
# is enough to switch datasets.
raw_datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)

## Download fp32 model from the hub

In [None]:
# Set seed before initializing model.
set_seed(training_args.seed)

# Download model & vocab.
# The checkpoint name comes from `model_args` everywhere (previously it was
# hard-coded to "t5-small" in three places while `from_tf` read `model_args`).
config = AutoConfig.from_pretrained(
    model_args.model_name_or_path,
    revision="main"
)
tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    revision="main",
    use_fast=True
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_args.model_name_or_path,
    # A ".ckpt" in the name is taken to mean a TensorFlow checkpoint.
    from_tf=".ckpt" in model_args.model_name_or_path,
    config=config,
    revision="main"
)

# Make the embedding matrix match the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# Prefix prepended to every source sentence during preprocessing. Use the
# configured task prefix (e.g. "translate English to Romanian: ") rather than
# silently discarding it; fall back to no prefix when none is configured.
prefix = data_args.source_prefix if data_args.source_prefix is not None else ""

## Preprocessing the dataset

In [None]:
# We need to tokenize inputs and targets.
# Column names of the raw train split; they are dropped from the mapped
# datasets via `remove_columns` when the tokenizer is applied.
column_names = raw_datasets["train"].column_names

# Get the language codes for input/target.
# Only the part before the first "_" is kept; it is used as the key into each
# "translation" record.
source_lang = data_args.source_lang.split("_")[0]
target_lang = data_args.target_lang.split("_")[0]

# Maximum tokenized target length; longer targets are truncated in preprocess_function.
max_target_length = 128
# Forwarded as the tokenizer's `padding` argument in preprocess_function.
padding = False

def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Reads the source/target sentences from ``examples["translation"]`` using
    the module-level ``source_lang``/``target_lang`` keys, prepends ``prefix``
    to each source sentence, and returns the tokenized sources with the
    tokenized target ids stored under ``"labels"``.
    """
    pairs = examples["translation"]
    source_texts = [pair[source_lang] for pair in pairs]
    target_texts = [pair[target_lang] for pair in pairs]

    encoded = tokenizer(
        [prefix + text for text in source_texts],
        max_length=1024,
        padding=padding,
        truncation=True,
    )

    # Targets are tokenized inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(
            target_texts,
            max_length=max_target_length,
            padding=padding,
            truncation=True,
        )

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

# define train dataset
train_dataset = raw_datasets["train"]
# Optionally keep only the first `max_train_samples` examples.
if data_args.max_train_samples is not None:
    max_train_samples = min(len(train_dataset), data_args.max_train_samples)
    train_dataset = train_dataset.select(range(max_train_samples))
# NOTE(review): main_process_first presumably lets the main process run the
# map before other processes in distributed runs -- confirm.
with training_args.main_process_first(desc="train dataset map pre-processing"):
    train_dataset = train_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=column_names,
        # Cached results are reused only when overwrite_cache is False.
        load_from_cache_file=not data_args.overwrite_cache,
        desc="Running tokenizer on train dataset",
    )

# define eval dataset
eval_dataset = raw_datasets["validation"]
# Optionally keep only the first `max_eval_samples` examples.
if data_args.max_eval_samples is not None:
    max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
    eval_dataset = eval_dataset.select(range(max_eval_samples))
with training_args.main_process_first(desc="validation dataset map pre-processing"):
    eval_dataset = eval_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=column_names,
        # Cached results are reused only when overwrite_cache is False.
        load_from_cache_file=not data_args.overwrite_cache,
        desc="Running tokenizer on validation dataset",
    )

# Data collator
# Labels are padded with -100; compute_metrics replaces -100 with the
# tokenizer's pad token id before decoding.
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    # Pad to a multiple of 8 only when fp16 is enabled in the training arguments.
    pad_to_multiple_of=8 if training_args.fp16 else None,
)

# Metric
# SacreBLEU metric object; its "score" entry is reported as "bleu" in compute_metrics.
metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Returns a pair ``(preds, labels)`` where every prediction is a stripped
    string and every reference is wrapped in its own single-element list.
    """
    return (
        [text.strip() for text in preds],
        [[reference.strip()] for reference in labels],
    )

def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for one evaluation run.

    ``eval_preds`` unpacks into ``(predictions, labels)``; when predictions is
    a tuple only its first element is used. Returns a dict with ``"bleu"`` and
    ``"gen_len"``, both rounded to 4 decimals.
    """
    generated_ids, label_ids = eval_preds
    if isinstance(generated_ids, tuple):
        generated_ids = generated_ids[0]
    pred_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # -100 cannot be decoded, so swap it for the pad token id first.
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    reference_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Whitespace clean-up and reference wrapping.
    pred_texts, reference_texts = postprocess_text(pred_texts, reference_texts)

    bleu_score = metric.compute(predictions=pred_texts, references=reference_texts)["score"]

    # Generated length = number of non-pad tokens in each prediction.
    lengths = [np.count_nonzero(row != tokenizer.pad_token_id) for row in generated_ids]

    scores = {"bleu": bleu_score, "gen_len": np.mean(lengths)}
    return {name: round(value, 4) for name, value in scores.items()}

# Key looked up in the evaluation results and name given to the tuning metric.
metric_name = "eval_bleu"
# Generation settings forwarded to trainer.evaluate and set on the trainer
# before quantization.
max_length = 128
num_beams = None

# Quantization & Benchmark

## Dynamic Post Training Quantization

In [None]:
# Set seed before initializing model.
set_seed(training_args.seed)
# Initialize our Trainer
# Datasets are passed only when the matching do_train/do_eval flag is set;
# compute_metrics is passed only when predict_with_generate is enabled.
trainer = NLPSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics if training_args.predict_with_generate else None,
)


# Tuning: save the model config and the model to output_dir before quantizing.
model.config.save_pretrained(training_args.output_dir)
trainer.save_model(training_args.output_dir)

# Metric that drives the tuning.
# NOTE(review): is_relative=True with criterion=0.25 presumably tolerates a
# relative drop of up to 25% in eval_bleu -- confirm against the Metric docs.
tune_metric = nlp_metrics.Metric(
    name=metric_name, is_relative=True, criterion=0.25
)
quantization_config = QuantizationConfig(
    approach="PostTrainingDynamic",
    max_trials=200,
    metrics=[tune_metric],
)
# Generation settings are set as trainer attributes before quantize() is called.
trainer.max_length = max_length
trainer.num_beams = num_beams
trainer.quantize(quant_config=quantization_config)

## Run Benchmark after Dynamic Post Training Quantization

In [None]:
# Evaluate with the trainer used for quantization and report accuracy and speed.
results = trainer.evaluate(max_length=max_length, num_beams=num_beams)
throughput = results.get("eval_samples_per_second")
print(f"Batch size = {training_args.per_device_eval_batch_size:d}")
print(f"Finally Eval {metric_name} Accuracy: {results[metric_name]:.5f}")
# Latency is derived from throughput: milliseconds per sample.
print(f"Latency: {1000 / throughput:.5f} ms")
print(f"Throughput: {throughput:.5f} samples/sec")

## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance

In [None]:
import os
# Run the multi-instance benchmark script (path is relative to the current
# working directory) on the saved int8 model with 4 cores per instance.
# os.system returns the command's exit status, so `results` is rebound here
# from the evaluation metrics dict to a status code.
results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')

## Run Benchmark for FP32 model

In [None]:
# Set seed before initializing model.
set_seed(training_args.seed)
# Build a separate trainer for the FP32 baseline.
# NOTE(review): this reuses `model`; it assumes the earlier trainer.quantize
# call did not modify that object in place -- confirm.
trainer_fp32 = NLPSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
results_fp32 = trainer_fp32.evaluate(max_length=max_length, num_beams=num_beams)
throughput_fp32 = results_fp32.get("eval_samples_per_second")
print(f"Batch size = {training_args.per_device_eval_batch_size:d}")
print(f"Finally Eval {metric_name} Accuracy: {results_fp32[metric_name]:.5f}")
# Latency is derived from throughput: milliseconds per sample.
print(f"Latency: {1000 / throughput_fp32:.5f} ms")
print(f"Throughput: {throughput_fp32:.5f} samples/sec")

## Run Benchmark for FP32 Model with Multi-Instance

In [None]:
import os
# Run the multi-instance benchmark script (path is relative to the current
# working directory) on the FP32 t5-small model with 4 cores per instance.
# os.system returns the command's exit status, so `results` holds a status
# code rather than metrics.
results = os.system('bash ../multi_instance.sh --model=t5-small --core_per_instance=4 --data_type=fp32')