<a href="https://colab.research.google.com/github/imakshatt/Deep-Learning-Concepts/blob/main/Quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import warnings
warnings.filterwarnings('ignore')
import os

In [None]:
class LeNet5(nn.Module):
  def __init__(self):
    super(LeNet5, self).__init__()
    self.conv1 = nn.Conv2d(3,6,5)
    self.conv2 = nn.Conv2d(6,16,15)
    self.fc1 = nn.Linear(16*5*5, 120)
    self.fc2 = nn.Linear(120, 84)
    self.fc3 = nn.Linear(84,10)

  def forward(self, x):
    x = F.max_pool2d(
        F.relu(self.conv1(x)), (2,2)
    )
    x = F.max_pool2d(
        F.relu(self.conv2(x)), 2
    )
    x = x.view(-1, int(x.nelement() / x.shape[0]))
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = self.fc3(x)
    return x

fp32_model = LeNet5()
fp32_model

LeNet5(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(15, 15), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

By default, all computation and memory are implemented as float32

In [None]:
for n,p in fp32_model.named_parameters():
  print(n, ": ", p.dtype)

conv1.weight :  torch.float32
conv1.bias :  torch.float32
conv2.weight :  torch.float32
conv2.bias :  torch.float32
fc1.weight :  torch.float32
fc1.bias :  torch.float32
fc2.weight :  torch.float32
fc2.bias :  torch.float32
fc3.weight :  torch.float32
fc3.bias :  torch.float32


Now Reduce the model which is fp32_model into half precision meaans convert FP32 to FP16

In [None]:
fp16_model = fp32_model.half()

for n,p in fp16_model.named_parameters():
  print(n, ": ", p.dtype)

conv1.weight :  torch.float16
conv1.bias :  torch.float16
conv2.weight :  torch.float16
conv2.bias :  torch.float16
fc1.weight :  torch.float16
fc1.bias :  torch.float16
fc2.weight :  torch.float16
fc2.bias :  torch.float16
fc3.weight :  torch.float16
fc3.bias :  torch.float16


Now Let's explore three modes of Quantization

1.   Post-training Dynamic Quantization
2.   Post-training Static Quantization
3.   Quantization Aware Training



# Post-training Dynamic Quantization


*   weights are Quantized (means FP32 to int8)
*   Activations are Stored and read in FP32, Quantized temporarily during
    calculation



In [None]:
import torch.quantization

quantized_model = torch.quantization.quantize_dynamic(
    fp32_model,
    {torch.nn.Linear},
    dtype=torch.qint8
)

In [None]:
quantized_model

LeNet5(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(15, 15), stride=(1, 1))
  (fc1): DynamicQuantizedLinear(in_features=400, out_features=120, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
  (fc2): DynamicQuantizedLinear(in_features=120, out_features=84, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
  (fc3): DynamicQuantizedLinear(in_features=84, out_features=10, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
)

# Comparing Model Size between Fp32 and int8

In [None]:
def print_size_of_model(model, label=""):
  torch.save(model.state_dict(), "temp.p")
  size = os.path.getsize("temp.p")
  print("Model: ", label, ' \t', 'Size (KB):', size/1e3)
  os.remove('temp.p')
  return size

fp32_model_size = print_size_of_model(fp32_model, "fp32")
quantized_model_size = print_size_of_model(quantized_model, "int8")


Model:  fp32  	 Size (KB): 165.71
Model:  int8  	 Size (KB): 108.778


# Post-training Static Quantization


*   weights are Quantized (means FP32 to int8)
*   Activations are also Quantized
*   Calbibration required here means we have to adjust the scaling factors



In [None]:
Static_Quant_Model = LeNet5()

Static_Quant_Model.qconfig = torch.quantization.get_default_qconfig('fbgemm')

torch.quantization.prepare(
    Static_Quant_Model, inplace=True
)
torch.quantization.convert(
    Static_Quant_Model, inplace=True
)

LeNet5(
  (conv1): QuantizedConv2d(3, 6, kernel_size=(5, 5), stride=(1, 1), scale=1.0, zero_point=0)
  (conv2): QuantizedConv2d(6, 16, kernel_size=(15, 15), stride=(1, 1), scale=1.0, zero_point=0)
  (fc1): QuantizedLinear(in_features=400, out_features=120, scale=1.0, zero_point=0, qscheme=torch.per_channel_affine)
  (fc2): QuantizedLinear(in_features=120, out_features=84, scale=1.0, zero_point=0, qscheme=torch.per_channel_affine)
  (fc3): QuantizedLinear(in_features=84, out_features=10, scale=1.0, zero_point=0, qscheme=torch.per_channel_affine)
)

In [None]:
fp32_model_size = print_size_of_model(fp32_model, "fp32")
quantized_model_size = print_size_of_model(quantized_model, "int8")
Static_Quantized_model_size = print_size_of_model(Static_Quant_Model, "Static")

Model:  fp32  	 Size (KB): 165.71
Model:  int8  	 Size (KB): 108.778
Model:  Static  	 Size (KB): 93.846


Let's take any inbuilt model


In [None]:
import torchvision

In [None]:
quant_model = torchvision.models.quantization.mobilenet_v2(pretrained=True, quantize=True)
model_no_quant = torchvision.models.mobilenet_v2(pretrained=True)


def model_size(modl):
  torch.save(modl.state_dict(), "demo.pt")
  print("%.2f MB" %(os.path.getsize("demo.pt")/1e6))


model_size(quant_model)
model_size(model_no_quant)

Downloading: "https://download.pytorch.org/models/quantized/mobilenet_v2_qnnpack_37f702c5.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2_qnnpack_37f702c5.pth
100%|██████████| 3.42M/3.42M [00:00<00:00, 14.4MB/s]
Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
100%|██████████| 13.6M/13.6M [00:00<00:00, 91.5MB/s]


3.62 MB
14.24 MB


# Quantization Aware Trainng

In [None]:
qat_model = LeNet5()

qat_model.qconfig = torch.ao.quantization.get_default_qat_qconfig('x86')

torch.quantization.prepare_qat(qat_model, inplace=True)

torch.quantization.convert(qat_model, inplace=True)

LeNet5(
  (conv1): QuantizedConv2d(3, 6, kernel_size=(5, 5), stride=(1, 1), scale=1.0, zero_point=0)
  (conv2): QuantizedConv2d(6, 16, kernel_size=(15, 15), stride=(1, 1), scale=1.0, zero_point=0)
  (fc1): QuantizedLinear(in_features=400, out_features=120, scale=1.0, zero_point=0, qscheme=torch.per_channel_affine)
  (fc2): QuantizedLinear(in_features=120, out_features=84, scale=1.0, zero_point=0, qscheme=torch.per_channel_affine)
  (fc3): QuantizedLinear(in_features=84, out_features=10, scale=1.0, zero_point=0, qscheme=torch.per_channel_affine)
)

In [None]:
fp32_model_size = print_size_of_model(fp32_model, "fp32")
quantized_model_size = print_size_of_model(quantized_model, "int8")
Static_Quantized_model_size = print_size_of_model(Static_Quant_Model, "Static")
qat_model_size = print_size_of_model(qat_model, "QAT");

Model:  fp32  	 Size (KB): 165.71
Model:  int8  	 Size (KB): 108.778
Model:  Static  	 Size (KB): 93.846
Model:  QAT  	 Size (KB): 93.782


In [None]:
t = torch.randn(1,5)
t

tensor([[-0.7223, -0.1920,  0.5035,  0.5588,  1.1065]])

In [None]:
qt = torch.quantize_per_tensor(t, scale=0.01, zero_point=0, dtype=torch.quint8)
qt

tensor([[0.0000, 0.0000, 0.5000, 0.5600, 1.1100]], size=(1, 5),
       dtype=torch.quint8, quantization_scheme=torch.per_tensor_affine,
       scale=0.01, zero_point=0)

In [None]:
float_tensor = torch.randn(2,2,3)
float_tensor


tensor([[[ 0.1204,  1.7465,  0.4542],
         [-2.3271,  0.4492, -0.9955]],

        [[ 0.5896, -0.0059,  0.7946],
         [-1.0846, -0.7051, -0.4628]]])

In [None]:
scales = torch.tensor([0.1,0.2,0.3])
dtype = torch.quint8
zero_points = torch.tensor([1,0,1])
channel_axis = 2

In [None]:
q_per_chanel = torch.quantize_per_channel(float_tensor, scales, zero_points, dtype=dtype, axis=channel_axis)
q_per_chanel

tensor([[[ 0.1000,  1.8000,  0.6000],
         [-0.1000,  0.4000, -0.3000]],

        [[ 0.6000,  0.0000,  0.9000],
         [-0.1000,  0.0000, -0.3000]]], size=(2, 2, 3), dtype=torch.quint8,
       quantization_scheme=torch.per_channel_affine,
       scale=tensor([0.1000, 0.2000, 0.3000], dtype=torch.float64),
       zero_point=tensor([1, 0, 1]), axis=2)