In [31]:
import warnings
warnings.filterwarnings("ignore")

import os
import matplotlib.pyplot as plt

plt.style.use("ggplot")


import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

In [32]:
transform = transforms.Compose(
    [
     transforms.ToTensor(),
     transforms.Normalize((0.5,), (0.5,))])

trainset = torchvision.datasets.MNIST(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, # type: ignore
                                          shuffle=True)

testset = torchvision.datasets.MNIST(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, # type: ignore
                                         shuffle=False)

In [3]:
class MnistModel(nn.Module):
    def __init__(self):
        super(MnistModel, self).__init__()

        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu1 = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.relu2 = nn.ReLU(inplace=True)

        self.maxpool = nn.MaxPool2d(2, 2)  # Initialized here

        self.fc1 = nn.Linear(7*7*64, 512)
        self.relu3 = nn.ReLU(inplace=True)

        self.fc2 = nn.Linear(512, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.maxpool(x)  

        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.maxpool(x) 

        x = x.reshape(x.shape[0], -1)
        x = self.fc1(x)
        x = self.relu3(x)

        x = self.fc2(x)
        return x

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [5]:
model = MnistModel()
model.to(device)

MnistModel(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU(inplace=True)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=3136, out_features=512, bias=True)
  (relu3): ReLU(inplace=True)
  (fc2): Linear(in_features=512, out_features=10, bias=True)
)

In [6]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [7]:
import torchinfo
torchinfo.summary(model=model, input_size=(64, 1, 28, 28), verbose=0, col_names=["input_size", "output_size", "num_params", "trainable"])

Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Trainable
MnistModel                               [64, 1, 28, 28]           [64, 10]                  --                        True
├─Conv2d: 1-1                            [64, 1, 28, 28]           [64, 32, 28, 28]          320                       True
├─BatchNorm2d: 1-2                       [64, 32, 28, 28]          [64, 32, 28, 28]          64                        True
├─ReLU: 1-3                              [64, 32, 28, 28]          [64, 32, 28, 28]          --                        --
├─MaxPool2d: 1-4                         [64, 32, 28, 28]          [64, 32, 14, 14]          --                        --
├─Conv2d: 1-5                            [64, 32, 14, 14]          [64, 64, 14, 14]          18,496                    True
├─BatchNorm2d: 1-6                       [64, 64, 14, 14]          [64, 64, 14, 14]          128                       True
├─ReLU:

In [9]:
from train_helpers import ClassifierTrainer, save_plots

trainer = ClassifierTrainer(
    model= model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=trainloader,
    val_loader=testloader,
    num_epochs=5,
    cuda=False
)
trainer.train()


Epoch : 1/5:   0%|          | 0/938 [00:00<?, ?it/s]

Epoch : 1/5: 100%|██████████| 938/938 [01:03<00:00, 14.82it/s, Accuracy=96.3, Loss=0.273]   


Validation Accuracy: 98.52% and Loss: 0.04262173857902671
Best validation loss: 0.04262173857902671
Saving best model for epoch: 1



Epoch : 2/5: 100%|██████████| 938/938 [01:13<00:00, 12.84it/s, Accuracy=98.578, Loss=0.0097]


Validation Accuracy: 98.49% and Loss: 0.04349938898966377


Epoch : 3/5: 100%|██████████| 938/938 [01:11<00:00, 13.07it/s, Accuracy=98.98, Loss=0.0058] 


Validation Accuracy: 99.06% and Loss: 0.02821018383197535
Best validation loss: 0.02821018383197535
Saving best model for epoch: 3



Epoch : 4/5: 100%|██████████| 938/938 [01:13<00:00, 12.80it/s, Accuracy=99.19, Loss=0.0473] 


Validation Accuracy: 98.96% and Loss: 0.034773673059488405


Epoch : 5/5: 100%|██████████| 938/938 [01:13<00:00, 12.72it/s, Accuracy=99.39, Loss=0.1013] 


Validation Accuracy: 99.14% and Loss: 0.029976082717447894


In [61]:
# save model (recomendation from docs)
model.eval()
torch.save(model.state_dict(), '../models/unquantize_model.pth')

# Dynamic Quantization

Dynamic quantization quantizes the model weights and is carried out dynamically during runtime. The activations are stored in their original floating-point format.

Since weights are quantized dynamically at runtime, it allows for more flexibility. It can be beneficial in handling cases where the range of values can vary. As activations remain in their original format, the accuracy loss is usually less than static quantization. Dynamic quantization doesn’t need calibration data, making it simpler to apply.

However, as dynamic quantization only quantizes the weights, not the activations, it provides less compression and speedup than static quantization. Since weights are quantized on-the-fly during inference, it may introduce some runtime overhead.

Dynamic Quantization is pretty straightforward and requires only a single step for quantization. Let’s load our Unquantized model’s weight:

In [11]:
# quantize model 
import torch.quantization
quantized_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)


In [12]:
# save model (recomendation from docs)
torch.save(quantized_model.state_dict(), '../models/dyn_quantized_model.pth')

In [13]:
import os
def get_model_size(path:str):
    size = os.path.getsize(path)
    return size

In [14]:
print(f"Size dynamic quantize model: {get_model_size(path='../models/dyn_quantized_model.pth')/ 1024:.2f} MB")
print(f"Size of non quantize model: {get_model_size(path='../models/unquantize_model.pth') / 1024:.2f} MB")


Size dynamic quantize model: 1657.39 MB
Size of non quantize model: 6374.91 MB


In [25]:
quantized_model.eval()

MnistModel(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU(inplace=True)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): DynamicQuantizedLinear(in_features=3136, out_features=512, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
  (relu3): ReLU(inplace=True)
  (fc2): DynamicQuantizedLinear(in_features=512, out_features=10, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
)

In [26]:
model.eval()

MnistModel(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU(inplace=True)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=3136, out_features=512, bias=True)
  (relu3): ReLU(inplace=True)
  (fc2): Linear(in_features=512, out_features=10, bias=True)
)

In [27]:
from utils import ModelCompare
model_compare = ModelCompare(
    model1=quantized_model,
    model1_info="Quantized Model",
    model2=model,
    model2_info="Unquantized Model",
    cuda=False
)

print("="*50)
model_compare.compare_size()
print("="*50)
model_compare.compare_accuracy(dataloder=testloader)
print("="*50)
model_compare.compare_inference_time(N=2 , dataloder=testloader)

Model Quantized Model Size(Mb): 1.69575
Model Unquantized Model Size(Mb): 6.526686
The Quantized Model is smaller by 74.02%.
Accuracy of Quantized Model: 99.15
Accuracy of Unquantized Model: 99.14
Average inference time of Quantized Model over 2 iterations: 3.447869658470154
Average inference time of Unquantized Model over 2 iterations: 3.4155484437942505
The Unquantized Model is faster by 0.94%.


# Static quantization

Step 1. Set the model to evaluation mode with model.eval(). This is important as certain layers like dropout and batchnorm behave differently during training and evaluation.

In [50]:
import copy
unquant_model_copy = copy.deepcopy(model)
unquant_model_copy.eval()

MnistModel(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU(inplace=True)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=3136, out_features=512, bias=True)
  (relu3): ReLU(inplace=True)
  (fc2): Linear(in_features=512, out_features=10, bias=True)
)

Step 2. Define the list of layers in your model architecture that can be fused together for the purpose of quantization. When performing quantization, certain groups of operations can be replaced by single operations that are equivalent but more computationally efficient. . For example, a convolution followed by a batch normalization, followed by a ReLU operation (Conv -> BatchNorm -> ReLU), can be replaced by a single fused ConvBnReLU operation. We will use torch.quantization.fuse_modules to fuse a list of modules into a single module. This has several advantages:

In [51]:
fused_layers = [['conv1', 'bn1', 'relu1'], ['conv2', 'bn2', 'relu2']]
fused_model = torch.quantization.fuse_modules(unquant_model_copy, fused_layers, inplace=True)

Step 3. Next, we will use the QuantizedModel wrapper class to wrap our model.



In [52]:
class QuantizedModel(torch.nn.Module):
    def __init__(self, model) -> None:
        super().__init__()
        self.fp32_model = model
        self.quant = torch.quantization.QuantStub()
        self.dequant = torch.quantization.DeQuantStub()
    
    def forward(self, x):
        x = self.quant(x)
        x = self.fp32_model(x)
        x = self.dequant(x)
        return x
    
quantized_model = QuantizedModel(model=fused_model)

The essence of this code is to add quantization and dequantization stubs to the model, which will act as ‘anchors’ to insert the actual quantization and dequantization functions in the model graph during the quantization process. The quant_layer converts the numbers in fp32 to int8 so that conv and relu will run in int8 format and then the dequant_layer will perform the int8 to fp32 conversion.

Step 4. Set the configuration for quantization using the get_default_qconfig function from torch.quantization. T

In [53]:
# Select quantization schemes from 
# https://pytorch.org/docs/stable/quantization-support.html

quantized_config = torch.quantization.get_default_qconfig("fbgemm")
quantized_model.qconfig = quantized_config

# Print quantization configurations
print(quantized_model.qconfig)

QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})


“fbgemm” is a high-performance, 8-bit quantization backend that is used on CPUs. It’s currently the recommended backend for quantization when deploying on servers. The qconfig attribute of a PyTorch model is used to specify how the model should be quantized. By assigning quantization_config to quantized_model.qconfig, you’re specifying that the model should be quantized according to the “fbgemm” backend’s default configuration.

Step 5. Prepare the model for quantization with the torch.quantization.prepare() function. The model is prepared in-place.

In [54]:
torch.quantization.prepare(quantized_model, inplace=True)

QuantizedModel(
  (fp32_model): MnistModel(
    (conv1): ConvReLU2d(
      (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
    )
    (bn1): Identity()
    (relu1): Identity()
    (conv2): ConvReLU2d(
      (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
    )
    (bn2): Identity()
    (relu2): Identity()
    (maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (fc1): Linear(
      in_features=3136, out_features=512, bias=True
      (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
    )
    (relu3): ReLU(inplace=True)
    (fc2): Linear(
      in_features=512, out_features=10, bias=True
      (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
    )
  )

Step 6. Calibrate the model with the test dataset. Run the model with a few examples to calibrate the quantization process.

In [55]:
def calibrate_model(model, loader, device=torch.device("cpu")):
    model.to(device)
    model.eval()

    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)
        _ = model(inputs)

calibrate_model(model=quantized_model, loader=trainloader, device="cpu")

During the quantization process, floating-point values are mapped to integer values. For weights, the range is known as they’re static and don’t change post-training. However, activations can vary depending on the input to the network. Calibration, typically performed by passing a subset of the data through the model and collecting the outputs, helps estimate this range.

Step 7. Convert the prepared model to a quantized model using torch.quantization.convert(). The conversion is also done in-place.

In [56]:
quantized_model = torch.quantization.convert(quantized_model, inplace=True)
quantized_model.eval()

QuantizedModel(
  (fp32_model): MnistModel(
    (conv1): QuantizedConvReLU2d(1, 32, kernel_size=(3, 3), stride=(1, 1), scale=0.047439757734537125, zero_point=0, padding=(1, 1))
    (bn1): Identity()
    (relu1): Identity()
    (conv2): QuantizedConvReLU2d(32, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.04675757512450218, zero_point=0, padding=(1, 1))
    (bn2): Identity()
    (relu2): Identity()
    (maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (fc1): QuantizedLinear(in_features=3136, out_features=512, scale=0.6587530970573425, zero_point=77, qscheme=torch.per_channel_affine)
    (relu3): ReLU(inplace=True)
    (fc2): QuantizedLinear(in_features=512, out_features=10, scale=0.5372301340103149, zero_point=66, qscheme=torch.per_channel_affine)
  )
  (quant): Quantize(scale=tensor([0.0157]), zero_point=tensor([64]), dtype=torch.quint8)
  (dequant): DeQuantize()
)

In [60]:
from utils import ModelCompare
model_compare = ModelCompare(
    model1=quantized_model,
    model1_info="Quantized Model",
    model2=model,
    model2_info="Uquantize model",
    cuda=False
)

print("="*50)
model_compare.compare_size()
print("="*50)
model_compare.compare_accuracy(dataloder=testloader)
print("="*50)
model_compare.compare_inference_time(N=2 , dataloder=testloader)

Model Quantized Model Size(Mb): 1.64939
Model Uquantize model Size(Mb): 6.526686
The Quantized Model is smaller by 74.73%.
Accuracy of Quantized Model: 99.16
Accuracy of Uquantize model: 99.14
Average inference time of Quantized Model over 2 iterations: 2.316429376602173
Average inference time of Uquantize model over 2 iterations: 2.6330199241638184
The Quantized Model is faster by 12.02%.


In [49]:
model

MnistModel(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU(inplace=True)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=3136, out_features=512, bias=True)
  (relu3): ReLU(inplace=True)
  (fc2): Linear(in_features=512, out_features=10, bias=True)
)

In [None]:
# 

# Quantization Aware Training

Static quantization enables the generation of highly efficient quantized integer models for inference. However, despite careful post-training calibration, there may be instances where the model’s accuracy is compromised to an unacceptable extent. In such cases, post-training calibration alone is insufficient for generating a quantized integer model. To account for the quantization effect, the model needs to be trained in a manner that considers quantization. Quantization-aware training addresses this by incorporating fake quantization modules, which simulate the clamping and rounding effects of integer quantization at the specific points where quantization occurs during the conversion from floating-point to quantized integer models. These fake quantization modules also monitor the scales and zero points of the weights and activations. Once the quantization awareness training is completed, the floating-point model can be readily converted to a quantized integer model using the information stored in the fake quantization modules.

The Quantization Aware training process borrows similar steps from static quantizaion. Let’s load our Unquantized model’s weight:



In [65]:
# load the torch state 
state = torch.load("outputs/best_model.pth")
quant_network = MnistModel()

# loading the state dict
quant_network.load_state_dict(state['model_state_dict'])

<All keys matched successfully>

In [66]:
quant_network.eval()

MnistModel(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU(inplace=True)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=3136, out_features=512, bias=True)
  (relu3): ReLU(inplace=True)
  (fc2): Linear(in_features=512, out_features=10, bias=True)
)

Step 1. Check the layers that can be fused and fuse the layers.

In [None]:
# check the layers that can be fused.
fused_layers = [['conv1', 'bn1', 'relu1'], ['conv2', 'bn2', 'relu2']]

# Fuse the layers
torch.quantization.fuse_modules(quant_network, fused_layers, inplace=True)

Step 2. Wrap the fused model using QuantizedModel


In [67]:
class QuantizedModel(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model_fp32 = model
        self.quant = torch.quantization.QuantStub()
        self.dequant = torch.quantization.DeQuantStub()
        
    def forward(self, x):
        x = self.quant(x)
        x = self.model_fp32(x)
        x = self.dequant(x)
        return x

# Apply torch.quantization.QuantStub() and torch.quantization.QuantStub() to the inputs and outputs, respectively.
quant_network = QuantizedModel(quant_network)

Step 3. Set the configuration for quantization

In [68]:

# Select quantization schemes from 
quantization_config = torch.quantization.get_default_qconfig("fbgemm")
 
quant_network.qconfig = quantization_config

# Print quantization configurations
print(quant_network.qconfig)

QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})


Step 4. Prepare model for QAT.

In [69]:
torch.quantization.prepare_qat(quant_network, inplace=True)

QuantizedModel(
  (model_fp32): MnistModel(
    (conv1): Conv2d(
      1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
      (weight_fake_quant): PerChannelMinMaxObserver(min_val=tensor([]), max_val=tensor([]))
      (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
    )
    (bn1): BatchNorm2d(
      32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
      (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
    )
    (relu1): ReLU(inplace=True)
    (conv2): Conv2d(
      32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
      (weight_fake_quant): PerChannelMinMaxObserver(min_val=tensor([]), max_val=tensor([]))
      (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
    )
    (bn2): BatchNorm2d(
      64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
      (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
    )
    (relu2): ReLU(inplace=True)
    (maxp

Step 5. Train QAT model


In [70]:

trainer = ClassifierTrainer(
    model= quant_network,
    optimizer=optimizer, 
    criterion=criterion,
    train_loader=trainloader,
    val_loader=testloader,
    cuda=False,
    num_epochs=4
)

trainer.train(save_model=False)

Epoch : 1/4: 100%|██████████| 938/938 [01:38<00:00,  9.49it/s, Accuracy=99.495, Loss=0.001] 


Validation Accuracy: 99.11% and Loss: 0.02844548009582636


Epoch : 2/4: 100%|██████████| 938/938 [01:38<00:00,  9.48it/s, Accuracy=99.502, Loss=0.0212]


Validation Accuracy: 99.05% and Loss: 0.028112857355681278


Epoch : 3/4: 100%|██████████| 938/938 [01:37<00:00,  9.60it/s, Accuracy=99.502, Loss=0.0002]


Validation Accuracy: 99.07% and Loss: 0.028242666940514027


Epoch : 4/4: 100%|██████████| 938/938 [01:39<00:00,  9.45it/s, Accuracy=99.487, Loss=0.0046]


Validation Accuracy: 99.05% and Loss: 0.0281196557682897


In [71]:
from utils import ModelCompare
model_compare = ModelCompare(
    model1=quantized_model,
    model1_info="Quantized Model",
    model2=model,
    model2_info="UnQuantized Model",
    cuda=False)

print("="*50)
model_compare.compare_size()
print("="*50)
model_compare.compare_accuracy(dataloder=testloader)
print("="*50)
model_compare.compare_inference_time(N=10 , dataloder=testloader)


Model Quantized Model Size(Mb): 1.64939
Model UnQuantized Model Size(Mb): 6.526686
The Quantized Model is smaller by 74.73%.
Accuracy of Quantized Model: 99.16
Accuracy of UnQuantized Model: 99.14
Average inference time of Quantized Model over 10 iterations: 2.267369604110718
Average inference time of UnQuantized Model over 10 iterations: 3.1072409629821776
The Quantized Model is faster by 27.03%.
