# Post-Training Dynamic Quantization

In [1]:
import torch
import torch.nn as nn

# define a floating point model
class M(nn.Module):
  """Floating point demo model: two stacked ``nn.Linear(8, 8)`` layers.

  The layers live in an ``nn.Sequential`` stored as ``self.fc``.
  """

  def __init__(self):
    super().__init__()
    # build the two 8 -> 8 linear layers in order, then chain them
    layers = [nn.Linear(8, 8) for _ in range(2)]
    self.fc = nn.Sequential(*layers)

  def forward(self, x):
    """Pass ``x`` through the sequential stack and return the result."""
    return self.fc(x)

# random example input: one row of 8 features
input_fp32 = torch.randn((1, 8))

# floating point baseline: build the model and record its output
model_fp32 = M()
result_fp32 = model_fp32(input_fp32)

# dynamically quantized model instance built from the float model
model_int8 = torch.ao.quantization.quantize_dynamic(
  model=model_fp32,   # the original model
  dtype=torch.qint8,  # the target dtype for quantized weights
)

# run the quantized model on the same input and print both results
result_int8 = model_int8(input_fp32)
print(result_fp32, result_int8, sep='\n')

# last expression of the cell: display the quantized module structure
model_int8

tensor([[ 0.1401, -0.1008, -0.2679,  0.0947,  0.2827, -0.2916, -0.3086, -0.4235]],
       grad_fn=<AddmmBackward0>)
tensor([[ 0.1425, -0.1028, -0.2645,  0.0918,  0.2796, -0.2880, -0.3057, -0.4221]])


M(
  (fc): Sequential(
    (0): DynamicQuantizedLinear(in_features=8, out_features=8, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
    (1): DynamicQuantizedLinear(in_features=8, out_features=8, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
  )
)

# Post-Training Static Quantization

In [2]:
# define a floating point model where some layers could be statically quantized
class M(nn.Module):
  """Floating point conv -> batchnorm -> relu model wrapped in quant/dequant stubs.

  The stubs mark, by hand, where tensors switch between floating point and
  quantized representation once the model has been statically quantized.
  """

  def __init__(self):
    super().__init__()
    quantization = torch.ao.quantization
    # entry stub: floating point -> quantized
    self.quant = quantization.QuantStub()
    self.conv = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=1, stride=2, bias=False)
    self.norm = nn.BatchNorm2d(num_features=32)
    self.relu = nn.ReLU()
    # exit stub: quantized -> floating point
    self.dequant = quantization.DeQuantStub()

  def forward(self, x):
    """Apply quant, conv, norm, relu and dequant to ``x``, in that order."""
    # attributes are looked up on every call, so modules swapped in later
    # (e.g. by fusion or conversion) are the ones that get executed
    stages = (self.quant, self.conv, self.norm, self.relu, self.dequant)
    for stage in stages:
      x = stage(x)
    return x


# Build the float model and switch it to eval mode right away; eval mode is
# required for the static quantization logic to work.
model_fp32 = M().eval()

# floating point reference output on a random 1x3x32x32 input
input_fp32 = torch.randn((1, 3, 32, 32))
result_fp32 = model_fp32(input_fp32)

# Global qconfig: decides which observers get attached. 'x86' targets server
# inference, 'qnnpack' targets mobile inference; the older 'fbgemm' backend is
# still available but 'x86' is the recommended server default. Other choices
# (symmetric vs. asymmetric quantization, MinMax vs. L2Norm calibration) are
# also configured through the qconfig.
model_fp32.qconfig = torch.ao.quantization.get_default_qconfig(backend='x86')

# Manual fusion step (depends on the architecture): fold the activation into
# the preceding layers. Typical patterns are `conv + relu` and
# `conv + batchnorm + relu`; here the latter applies.
model_fp32_fused = torch.ao.quantization.fuse_modules(
  model=model_fp32,
  modules_to_fuse=[['conv', 'norm', 'relu']],
)

# Insert the observers that will watch activation tensors during calibration.
model_fp32_prepared = torch.ao.quantization.prepare(model=model_fp32_fused)

# Calibration pass to determine quantization parameters for activations.
# A real workflow would feed a representative dataset instead of one tensor.
result_fused = model_fp32_prepared(input_fp32)

# Turn the observed model into a quantized one: weights are quantized, the
# quantization parameters for each activation tensor are computed and stored,
# and key operators are replaced with quantized implementations.
model_int8 = torch.ao.quantization.convert(model_fp32_prepared)

# inference with the quantized model; the relevant calculations happen in int8
result_int8 = model_int8(input_fp32)

# float output, observed (fused) output, then quantized output
print(result_fp32, result_fused, result_int8, sep='\n')

tensor([[[[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [1.1930, 0.0000, 0.4493,  ..., 0.0000, 1.0526, 0.0113],
          [1.6411, 0.3510, 0.0000,  ..., 0.0000, 0.7089, 0.4897],
          ...,
          [0.0000, 0.3751, 0.7795,  ..., 0.0000, 0.1216, 0.6287],
          [0.7933, 0.0000, 0.0000,  ..., 0.5304, 0.0000, 1.8350],
          [1.1003, 0.7213, 0.0000,  ..., 0.1184, 0.8121, 0.5085]],

         [[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.7369, 0.0000, 0.0000,  ..., 0.0000, 0.4432, 0.0000],
          [0.8121, 0.1881, 0.0000,  ..., 0.0373, 0.6586, 0.3228],
          ...,
          [0.1242, 0.1278, 0.2615,  ..., 0.0000, 0.2960, 0.3051],
          [0.8318, 0.0139, 0.0000,  ..., 0.4018, 0.0000, 1.1432],
          [0.5474, 0.4446, 0.0091,  ..., 0.0000, 0.4757, 0.4384]],

         [[0.7539, 0.5231, 0.0000,  ..., 0.4106, 0.2636, 1.6610],
          [0.0000, 0.0000, 0.0000,  ..., 0.1882, 0.0000, 0.0000],
          [0.0000, 0.0056, 0.2654,  ..., 0



# Hugging Face quantization

In [13]:
# Shell step: export the `gpt2` model to ONNX with the optimum CLI, writing the
# result under $HOME/onnx/gpt2_onnx/.
!optimum-cli export onnx --model gpt2 $HOME/onnx/gpt2_onnx/

Framework not specified. Using pt to export to ONNX.
Automatic task detection to causal-lm-with-past.
use_past = False is different than use_present_in_outputs = True, the value of use_present_in_outputs value will be used for the outputs.
Using framework PyTorch: 2.0.0a0+gitc263bd4
Overriding 2 configuration item(s)
	- use_cache -> True
	- pad_token_id -> 0
verbose: False, log level: Level.ERROR

Using framework PyTorch: 2.0.0a0+gitc263bd4
Overriding 2 configuration item(s)
	- use_cache -> True
	- pad_token_id -> 0
Asked a sequence length of 16, but a sequence length of 1 will be used with use_past == True for `input_ids`.
verbose: False, log level: Level.ERROR

Asked a sequence length of 16, but a sequence length of 1 will be used with use_past == True for `input_ids`.
Validating ONNX model /home/kiddos/onnx/gpt2_onnx/decoder_model_merged.onnx...
2023-04-09 08:02:58.517404572 [W:onnxruntime:, graph.cc:3490 CleanUnusedInitializersAndNodeArgs] Removing initializer '/transformer/h.11/at

In [15]:
# Shell step: quantize the exported ONNX files in $HOME/onnx/gpt2_onnx/ with the
# optimum CLI. The recorded log reports a dynamic quantizer (u8/s8) and that the
# quantized model is saved back into the same directory.
# NOTE(review): --avx512 presumably selects an AVX512-targeted quantization
# configuration -- confirm against the optimum CLI help.
!optimum-cli onnxruntime quantize --onnx_model $HOME/onnx/gpt2_onnx/ --avx512

Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/s8, channel-wise: False)
Quantizing model...
Saving quantized model at: /home/kiddos/onnx/gpt2_onnx (external data format: False)
Configuration saved in /home/kiddos/onnx/gpt2_onnx/ort_config.json
Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/s8, channel-wise: False)
Quantizing model...
Saving quantized model at: /home/kiddos/onnx/gpt2_onnx (external data format: False)
Configuration saved in /home/kiddos/onnx/gpt2_onnx/ort_config.json
Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/s8, channel-wise: False)
Quantizing model...
Saving quantized model at: /home/kiddos/onnx/gpt2_onnx (external data format: False)
Configuration saved in /home/kiddos/onnx/gpt2_onnx/ort_config.json


In [17]:
from optimum.onnxruntime import ORTQuantizer, ORTModelForCausalLM
import os

# Path of the quantized decoder file inside the user's home directory.
# os.path.expanduser('~') is used instead of os.getenv('HOME'): getenv returns
# None when HOME is unset, which would make os.path.join raise a TypeError.
# When HOME is set, the resulting path is the same as before.
model_path = os.path.join(os.path.expanduser('~'), 'onnx', 'gpt2_onnx', 'decoder_with_past_model_quantized.onnx')
# Load the ONNX file; the recorded output of the following cell shows the
# returned value as a tuple of (onnxruntime InferenceSession, None).
onnx_model = ORTModelForCausalLM.load_model(model_path)

In [18]:
# Display what load_model returned; the recorded output is a tuple of an
# onnxruntime InferenceSession and None.
onnx_model

(<onnxruntime.capi.onnxruntime_inference_collection.InferenceSession at 0x7effe1671700>,
 None)

In [20]:
# Location of the LLaMA 7B checkpoint inside the user's home directory.
# os.path.expanduser('~') replaces os.getenv('HOME'), which returns None when
# HOME is unset and would make os.path.join raise a TypeError.
llama_path = os.path.join(os.path.expanduser('~'), 'models', 'llama', '7B', 'consolidated.00.pth')
# torch.load unpickles the file, so a downloaded checkpoint is untrusted input:
# weights_only=True restricts unpickling to tensors and plain containers, and
# map_location='cpu' keeps the load independent of the device the checkpoint
# was saved from.
llama = torch.load(llama_path, map_location='cpu', weights_only=True)

In [22]:
# Print the loaded checkpoint object in full. The recorded output shows a dict
# mapping parameter names (e.g. 'tok_embeddings.weight') to float16 tensors.
# NOTE(review): this dumps every entry; printing only the keys/shapes would be
# far shorter if the full dump is not needed.
print(llama)

{'tok_embeddings.weight': tensor([[ 9.8884e-05, -2.3329e-04,  5.8460e-04,  ..., -3.4237e-04,
          5.9724e-05, -1.1957e-04],
        [ 1.5289e-02, -1.2154e-02,  1.2512e-02,  ...,  1.3092e-02,
          7.2174e-03, -6.8045e-04],
        [ 1.7433e-03,  1.7633e-03, -1.4465e-02,  ..., -1.1444e-02,
         -1.2665e-02,  3.7289e-04],
        ...,
        [-9.0179e-03,  3.0807e-02, -1.6708e-02,  ..., -1.2680e-02,
          1.0437e-02,  4.2343e-03],
        [-1.1368e-02, -1.4801e-02, -3.5667e-03,  ...,  6.5308e-03,
         -2.2263e-02, -6.1455e-03],
        [-1.3992e-02,  1.6985e-03, -2.1469e-02,  ...,  1.3527e-02,
          2.8290e-02, -8.9111e-03]], dtype=torch.float16), 'norm.weight': tensor([1.8760, 1.5547, 1.6357,  ..., 1.7070, 1.6543, 1.5713],
       dtype=torch.float16), 'output.weight': tensor([[-0.0145,  0.0008,  0.0043,  ..., -0.0016, -0.0147, -0.0086],
        [ 0.0200, -0.0425,  0.0167,  ..., -0.0190, -0.0644,  0.0189],
        [ 0.0211,  0.0158,  0.0270,  ...,  0.0338,  0.01

In [21]:
# Attempt dynamic int8 quantization of the loaded LLaMA checkpoint.
# NOTE(review): `llama` is the object returned by torch.load, and the recorded
# print(llama) output shows a plain dict of tensors. The earlier
# quantize_dynamic call in this notebook was given an nn.Module instance, and
# no output is recorded for this cell. Presumably the weights must first be
# loaded into a model object before this call can work -- confirm.
llama_int8 = torch.ao.quantization.quantize_dynamic(
  llama,  # the original model
  dtype=torch.qint8,  # the target dtype for quantized weights
)

In [4]:
from transformers import AutoModelForCausalLM

# model identifier handed to from_pretrained
model_name = 'chainyo/alpaca-lora-7b'

# load the causal LM, requesting 4-bit loading and automatic device mapping
model = AutoModelForCausalLM.from_pretrained(
  model_name,
  load_in_4bit=True,
  device_map='auto',
)