# ELE6310 - Assignment 1 - Quantization

#### Name: 
#### Student ID: 

In [None]:
#@title Mount your Google Drive
# Notebook-wide setup: draw matplotlib figures inline, and enable autoreload
# (mode 2) so edits to imported modules are picked up without a kernel restart.
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Mount Google Drive under /content/gdrive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/gdrive')


In [None]:
#@title Link your assignment folder & install requirements
#@markdown Enter the path to the assignment folder in your Google Drive
import sys
import os
import shutil
import warnings

folder = "/content/gdrive/MyDrive/ELE6310/A1" #@param {type:"string"}
!ln -Ts $folder /content/A1 2> /dev/null

# Add the assignment folder to Python path
if '/content/A1' not in sys.path:
    sys.path.insert(0, '/content/A1')

# Install requirements
!pip install -qr /content/A1/requirements.txt

# Check if CUDA is available
import torch
if not torch.cuda.is_available():
    warnings.warn('CUDA is not available.')

## 1- Calibration [50 pts]

In [None]:
import solution
from common.test_functions import *
import torch
import numpy as np
import random
from matplotlib import pyplot as plt
import os

* First, complete the `linear_quantize`, `linear_dequantize`, `update_scale_and_zero_point`, and `get_scale` functions in `solution.py`, and then run the tests below.

In [None]:
# Run the checks for the quantization helpers implemented in solution.py.
# NOTE(review): these test_* functions are presumably provided by the
# `from common.test_functions import *` import — confirm.
test_linear_quantize()
test_linear_dequantize()
test_update_scale_and_zero_point()

* Now we will see the performance of each quantization method on different datasets.

In [None]:
# Load Dataset A from the assignment folder and pass it to the histogram
# helper with a 2-bit setting.
dataset_a_path = os.path.join(folder, 'Dataset_A.t')
data = torch.load(dataset_a_path)
plot_real_dequantized_histogram(data, N_bits=2)

In [None]:
# Load Dataset B from the assignment folder and pass it to the histogram
# helper with a 2-bit setting.
dataset_b_path = os.path.join(folder, 'Dataset_B.t')
data = torch.load(dataset_b_path)
plot_real_dequantized_histogram(data, N_bits=2)

* Compare your results. Which method works better? Do you think the quantization error has a bias? Explain your observations.


\begin{array}{|c|ccc|ccc|}\hline\\ 
     Dataset && A &&& B \\ \hline
Bit width & 8 & 4 & 2 & 8 & 4 & 2 \\ \hline
Symmetric & ?? & ?? & ?? & ?? & ?? & ?? \\ 
Heuristic Method & ?? & ?? & ?? & ?? & ?? & ?? \\ 
SAWB & ?? & ?? & ?? & ?? & ?? & ?? \\ \hline
\end{array}

## 2- PTQ -vs- QAT [60 pts]

* Complete `quantize_func_STE` so that the quantization block (linear quantize and dequantize together) meets the straight-through estimator (STE) condition.


* Complete the `quantized_linear_function` and `quantized_conv2d_function` functions using only `integer_linear` and `integer_conv2d`.

In [None]:
# Run the provided gradient check for the `quantize_func_STE` implementation.
# NOTE(review): presumably provided by common.test_functions — confirm.
test_STE_grad()

In [None]:
# Check the quantized linear function with 2-bit weights / 8-bit activations
# ('SAWB'), then the linear module with 8-bit weights and activations ('sym').
# Both runs are without bias.
test_quantized_linear_function(weight_N_bits=2, act_N_bits=8, method='SAWB', bias=False)
test_quantized_linear_module(weight_N_bits=8, act_N_bits=8, method='sym', bias=False)

In [None]:
# Check the quantized conv2d function with 2-bit weights and activations
# ('sym'), then the conv2d module with 8-bit weights and activations ('SAWB').
# Both runs are without bias.
test_quantized_conv2d_function(weight_N_bits=2, act_N_bits=2, method='sym', bias=False)
test_quantized_conv2d_module(weight_N_bits=8, act_N_bits=8, method='SAWB', bias=False)

* In this assignment we use resnet20 with pre-trained weights on CIFAR10. First, let's see the accuracy and model size of our network.

In [None]:
from common.utils import load_CIFAR10_dataset, evaluate, fit, model_size
from common.resnet import resnet20

In [None]:
# Fix the seed of every random number generator used in this notebook
# (Python, NumPy, PyTorch CPU and, when present, PyTorch CUDA).
Seed = 6310
for _seed_cpu_rng in (torch.manual_seed, np.random.seed, random.seed):
    _seed_cpu_rng(Seed)

# The CUDA generators are only seeded when a GPU is available.
if torch.cuda.is_available():
    for _seed_cuda_rng in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        _seed_cuda_rng(Seed)

In [None]:
# Build the CIFAR-10 train / test / calibration loaders and the pre-trained resnet20.
train_loader, test_loader, calibration_loader = load_CIFAR10_dataset(batch_size=256, calibration_batch_size=1024)
model = resnet20(pretrained=True, save_path='./save/')

# Use the first GPU when available, otherwise fall back to the CPU. The setup
# cell only warns when CUDA is missing, so this cell must not hard-code 'cuda:0'.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Baseline: accuracy and size of the full-precision (fp) model.
accuracy = evaluate(model, test_loader, device)
print("test accuracy of fp model:", accuracy)
model_size(model)

* In the first step, we use the calibration set to initialize the scale factors of the activations in each layer.

In [None]:
# Quantization settings for this run: method name plus activation / weight bit widths.
# Change these three values to try the other WxAy configurations.
method='sym'
act_N_bits=4
weight_N_bits=4
# Build the quantized model from the fp model using the calibration loader.
# NOTE(review): whether model_to_quant modifies `model` in place is not visible here — confirm.
quantized_model = model_to_quant(model, calibration_loader, act_N_bits, weight_N_bits,method, device)

In [None]:
# Accuracy and size of the quantized model. The label says "quantized" because
# this cell evaluates `quantized_model`, not the full-precision baseline.
accuracy = evaluate(quantized_model, test_loader, device)
print("test accuracy of quantized model:", accuracy)
model_size(quantized_model)

In [None]:
# Plot the per-layer histograms of the quantized model.
# NOTE(review): presumably provided by common.test_functions — confirm.
plot_layers_histogram(quantized_model)


* Try `W8A8`, `W4A4`, `W2A2`, `W8A2`, and `W2A8` quantization.

* Now try to fine-tune the specified models using any desired training method, and save the best-performing model.

In [None]:
# Fine-tuning setup: cross-entropy loss, Nesterov SGD and no LR scheduler.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    quantized_model.parameters(),
    lr=1e-4,
    momentum=0.9,
    weight_decay=0.0005,
    nesterov=True,
)
scheduler = None

# Call `fit` on the quantized model (second argument: 5) and keep the returned
# train / test accuracies.
train_accuracy, test_accuracy = fit(
    quantized_model,
    5,
    train_loader,
    test_loader,
    criterion,
    optimizer,
    scheduler,
    device,
)


## 3- Variable precision [30 pts]

In the variable-precision (or "mixed-precision") method, each layer is quantized with a different bit precision. In this part, we want to find the optimal model size for resnet20 on the CIFAR-10 dataset. For this part, we focus only on weight quantization (with the signed symmetric method) and keep the activations at 16 bits.

* Use a method of your choice to find the optimal model size with a constraint on test accuracy above 85\%. 
Any reasonable attempt at exploring the design space will give you full marks. Better approaches/results will be considered for bonus points. 

\begin{array}{|c|cc|cc|}\hline\\ 
      & PTQ && QAT \\ \hline
method & Symmetric & SAWB & Symmetric & SAWB \\ \hline
W8A8 & ?? & ?? & ?? & ?? \\ 
W4A4 & ?? & ?? & ?? & ?? \\ 
W2A2 & ?? & ?? & ?? & ?? \\ 
W8A2 & ?? & ?? & ?? & ?? \\ 
W2A8 & ?? & ?? & ?? & ?? \\ \hline
\end{array}

In [None]:
def _resnet20_bitwidth_keys():
    """Yield the layer names used as `bitwidth_dict` keys: stage by stage, then 'fc'."""
    for stage in (1, 2, 3):
        for block in (0, 1, 2):
            prefix = f'layer{stage}.{block}'
            yield f'{prefix}.conv1'
            yield f'{prefix}.conv2'
            # Only the first block of stages 2 and 3 has a downsample entry.
            if stage > 1 and block == 0:
                yield f'{prefix}.downsample.0'
    yield 'fc'


# Per-layer weight bit widths; every layer starts at 8 bits. Override individual
# entries afterwards (e.g. bitwidth_dict['fc'] = 4) to explore mixed precision.
bitwidth_dict = {layer_name: 8 for layer_name in _resnet20_bitwidth_keys()}


In [None]:
# Mixed-precision run: 'sym' method with 16-bit activations and the per-layer
# weight bit widths from `bitwidth_dict`.
method='sym'
act_N_bits=16
# `q_method` was never defined in this notebook and made this cell fail with a
# NameError, so it is dropped; the remaining arguments follow the earlier
# model_to_quant call, with `bitwidth_dict` appended.
# NOTE(review): `weight_N_bits` is reused from the earlier quantization cell;
# presumably `bitwidth_dict` takes precedence per layer — confirm against model_to_quant.
quantized_model = model_to_quant(model, calibration_loader, act_N_bits, weight_N_bits, method, device, bitwidth_dict)

In [None]:
# Accuracy and size of the mixed-precision quantized model. The label reflects
# that this cell evaluates `quantized_model`, not the full-precision baseline.
accuracy = evaluate(quantized_model, test_loader, device)
print("test accuracy of mixed-precision quantized model:", accuracy)
model_size(quantized_model)