## Jakob Ohmayer (4742300)

In [1]:
import os
import zipfile
import torch
import multiprocessing
import requests
import timeit
from torch.utils.data import DataLoader
from torchvision import transforms as T
from torchvision.datasets import CIFAR10
from tqdm import tqdm
import torch.nn as nn

### Load model ResNet34

In [2]:
import adapt

In [3]:
from models.resnet import resnet34

In [4]:
# Determine the number of CPU cores
num_cores = multiprocessing.cpu_count()

threads = num_cores * 2 # two threads are available on github codespaces per core
torch.set_num_threads(threads)

#maybe better performance
%env OMP_PLACES=cores
%env OMP_PROC_BIND=close
%env OMP_WAIT_POLICY=active

env: OMP_PLACES=cores
env: OMP_PROC_BIND=close
env: OMP_WAIT_POLICY=active


### Choose approximate multiplier 

Here the multiplier SPR12_44 is used. The multiplier ``SPR12_44.h`` is included in the folder and needs to be placed under ``/adapt/cpu-kernels/axx_mults`` to work properly.

In [5]:
axx_mult = 'SPR12_44'

### Load model for evaluation

In [6]:
model = resnet34(pretrained=True, axx_mult = axx_mult)

model.eval() # for evaluation

Using /root/.cache/torch_extensions as PyTorch extensions root...
Emitting ninja build file /root/.cache/torch_extensions/PyInit_conv2d_SPR12_44/build.ninja...
Building extension module PyInit_conv2d_SPR12_44...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module PyInit_conv2d_SPR12_44...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_SPR12_44, skipping build step...
Loading extension module PyInit_conv2d_SPR12_44...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_SPR12_44, skipping build step...
Loading extension module PyInit_conv2d_SPR12_44...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_SPR12_44, skipping bui

ResNet(
  (conv1): AdaPT_Conv2d(
    3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
    (quantizer): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
    (quantizer_w): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
  )
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): AdaPT_Conv2d(
        64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
        (quantizer): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
        (quantizer_w): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
      )
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(

### Load dataset


In [7]:
def val_dataloader(mean = (0.4914, 0.4822, 0.4465), std = (0.2471, 0.2435, 0.2616)):

    transform = T.Compose(
        [
            T.ToTensor(),
            T.Normalize(mean, std),
        ]
    )
    dataset = CIFAR10(root="datasets/cifar10_data", train=False, download=True, transform=transform)
    dataloader = DataLoader(
        dataset,
        batch_size=128,
        num_workers=0,
        drop_last=True,
        pin_memory=False,
    )
    return dataloader

transform = T.Compose(
        [
            T.RandomCrop(32, padding=4),
            T.RandomHorizontalFlip(),
            T.ToTensor(),
            T.Normalize(mean = (0.4914, 0.4822, 0.4465), std = (0.2471, 0.2435, 0.2616)),
        ]
    )
dataset = CIFAR10(root="datasets/cifar10_data", train=True, download=True, transform=transform)

evens = list(range(0, len(dataset), 10))
trainset_1 = torch.utils.data.Subset(dataset, evens)

data = val_dataloader()

# data_t is used for calibration purposes and is a subset of train-set
data_t = DataLoader(trainset_1, batch_size=128,
                                            shuffle=False, num_workers=0)


Files already downloaded and verified
Files already downloaded and verified


### Run model calibration for quantization

Calibrates the quantization parameters. Is rerun after each training epoch.

In [8]:
from pytorch_quantization import nn as quant_nn
from pytorch_quantization import calib

def collect_stats(model, data_loader, num_batches):
     """Feed data to the network and collect statistic"""

     # Enable calibrators
     for name, module in model.named_modules():
         if isinstance(module, quant_nn.TensorQuantizer):
             if module._calibrator is not None:
                 module.disable_quant()
                 module.enable_calib()
             else:
                 module.disable()

     for i, (image, _) in tqdm(enumerate(data_loader), total=num_batches):
         model(image.cpu())
         if i >= num_batches:
             break

     # Disable calibrators
     for name, module in model.named_modules():
         if isinstance(module, quant_nn.TensorQuantizer):
             if module._calibrator is not None:
                 module.enable_quant()
                 module.disable_calib()
             else:
                 module.enable()

def compute_amax(model, **kwargs):
 # Load calib result
 for name, module in model.named_modules():
     if isinstance(module, quant_nn.TensorQuantizer):
         if module._calibrator is not None:
             if isinstance(module._calibrator, calib.MaxCalibrator):
                 module.load_calib_amax()
             else:
                 module.load_calib_amax(**kwargs)
         print(F"{name:40}: {module}")
 model.cpu()

def calibrate_model(model, data_t):
    # It is a bit slow since we collect histograms on CPU
    with torch.no_grad():
        stats = collect_stats(model, data_t, num_batches=2)
        amax = compute_amax(model, method="percentile", percentile=99.99)
    
        # optional - test different calibration methods
        #amax = compute_amax(model, method="mse")
        #amax = compute_amax(model, method="entropy")


In [9]:
calibrate_model(model, data_t)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:08<00:00,  4.25s/it]
W1118 13:17:27.472463 129753190664000 tensor_quantizer.py:173] Disable HistogramCalibrator
W1118 13:17:27.472995 129753190664000 tensor_quantizer.py:173] Disable HistogramCalibrator
W1118 13:17:27.473452 129753190664000 tensor_quantizer.py:173] Disable HistogramCalibrator
W1118 13:17:27.473854 129753190664000 tensor_quantizer.py:173] Disable HistogramCalibrator
W1118 13:17:27.474242 129753190664000 tensor_quantizer.py:173] Disable HistogramCalibrator
W1118 13:17:27.474998 129753190664000 tensor_quantizer.py:173] Disable HistogramCalibrator
W1118 13:17:27.475342 129753190664000 tensor_quantizer.py:173] Disable HistogramCalibrator
W1118 13:17:27.475764 129753190664000 tensor_quantizer.py:173] Disable HistogramCalibrator
W1118 13:17:27.476261 129753190664000 tensor_quantizer.py:173] Disable HistogramCalibrator
W1118 13:17:27.476659 1

W1118 13:17:27.525258 129753190664000 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).
W1118 13:17:27.525842 129753190664000 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).
W1118 13:17:27.526487 129753190664000 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).
W1118 13:17:27.527717 129753190664000 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).
W1118 13:17:27.528644 129753190664000 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).
W1118 13:17:27.530295 129753190664000 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).
W1118 13:17:27.531464 129753190664000 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).
W1118 13:17:27.532527 129753190664000 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).
W1118 13:17:27.533547 129753190664000 tensor_quantizer.py:237] Load calibrated amax, shape=torch.Size([]).
W1118 13:17:27.534652 129753190664000

conv1.quantizer                         : TensorQuantizer(8bit per-tensor amax=2.1255 calibrator=HistogramCalibrator quant)
conv1.quantizer_w                       : TensorQuantizer(8bit per-tensor amax=0.1418 calibrator=HistogramCalibrator quant)
layer1.0.conv1.quantizer                : TensorQuantizer(8bit per-tensor amax=0.6744 calibrator=HistogramCalibrator quant)
layer1.0.conv1.quantizer_w              : TensorQuantizer(8bit per-tensor amax=0.0555 calibrator=HistogramCalibrator quant)
layer1.0.conv2.quantizer                : TensorQuantizer(8bit per-tensor amax=0.3303 calibrator=HistogramCalibrator quant)
layer1.0.conv2.quantizer_w              : TensorQuantizer(8bit per-tensor amax=0.0327 calibrator=HistogramCalibrator quant)
layer1.1.conv1.quantizer                : TensorQuantizer(8bit per-tensor amax=0.6067 calibrator=HistogramCalibrator quant)
layer1.1.conv1.quantizer_w              : TensorQuantizer(8bit per-tensor amax=0.0395 calibrator=HistogramCalibrator quant)
layer1.1

### Run model evaluation (before re-training)

In [17]:
correct = 0
total = 0

model.eval()
start_time = timeit.default_timer()
with torch.no_grad():
    for iteraction, (images, labels) in tqdm(enumerate(data), total=len(data)):
        images, labels = images.to("cpu"), labels.to("cpu")
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(timeit.default_timer() - start_time)
print('Accuracy of the network on the 10000 test images: %.4f %%' % (
    100 * correct / total))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78/78 [15:44<00:00, 12.11s/it]

944.601157688001
Accuracy of the network on the 10000 test images: 33.7440 %





**Accuracy before re-training**: 33.74% \
This low accuracy is not suprising as firstly a approximate multiplier is used which will reduce the accuracy depending on how much approximation happens. Secondly the model ResNet34 used here is pretrained on ImageNet. For evaluation Cifar10 is used. Therefore the accuracy will be lower.

### Run approximate-aware re-training for 15 epochs


As re-training takes quite long and brakes sometimes the individual models are saved after each epoch to enable loading these models to continue fine-tuning. Also the re-training was done using the python script ``Homework03.py`` included in the folder. Therefore the log output of the following cell is not correct. The majority of the output is included in the file ``training.log`` in the same folder. The difference in runtime is due to using a 2-core instance at the beginning and a 4-core instance later.

The pre-trained model after 15 epochs can be found [here](https://mega.nz/file/BRQWCLbJ#wA-qBEAAWPe3Ym9y2nB8v1PATPe_xRHGd1-U7OYnPNs) to reproduce the results. The model needs to be placed under a folder ``saved`` in the ``examples`` directory.

In [10]:
from adapt.references.classification.train import evaluate, train_one_epoch, load_data

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

EPOCHS = 15

# load model if necessary
load_epoch = 15
if load_epoch > 0:
    model.load_state_dict(torch.load(f"./saved/retrained_model_epoch_{load_epoch}.pth"))
    #calibrate_model(model, data_t)

# finetune the model for one epoch based on data_t subset
for epoch in range(EPOCHS):
    EPOCH_NUM = epoch + 1
    if EPOCH_NUM > load_epoch:
        print(f"Epoch {epoch + 1}/{EPOCHS}")
        train_one_epoch(model, criterion, optimizer, data_t, "cpu", EPOCH_NUM, 1)
        calibrate_model(model, data_t)

        # Save the model after each epoch
        model_path = f"./saved/retrained_model_epoch_{EPOCH_NUM}.pth"
        torch.save(model.state_dict(), model_path)
        print(f"Model saved to {model_path} after epoch {EPOCH_NUM}")
    else:
        print(f"Skipping epoch {EPOCH_NUM} as pretrained model exists")

W1118 13:17:36.388638 129753190664000 tensor_quantizer.py:402] conv1.quantizer: Overwriting amax.
W1118 13:17:36.389351 129753190664000 tensor_quantizer.py:402] conv1.quantizer_w: Overwriting amax.
W1118 13:17:36.390566 129753190664000 tensor_quantizer.py:402] layer1.0.conv1.quantizer: Overwriting amax.
W1118 13:17:36.391044 129753190664000 tensor_quantizer.py:402] layer1.0.conv1.quantizer_w: Overwriting amax.
W1118 13:17:36.392193 129753190664000 tensor_quantizer.py:402] layer1.0.conv2.quantizer: Overwriting amax.
W1118 13:17:36.392657 129753190664000 tensor_quantizer.py:402] layer1.0.conv2.quantizer_w: Overwriting amax.
W1118 13:17:36.393955 129753190664000 tensor_quantizer.py:402] layer1.1.conv1.quantizer: Overwriting amax.
W1118 13:17:36.394437 129753190664000 tensor_quantizer.py:402] layer1.1.conv1.quantizer_w: Overwriting amax.
W1118 13:17:36.395371 129753190664000 tensor_quantizer.py:402] layer1.1.conv2.quantizer: Overwriting amax.
W1118 13:17:36.395950 129753190664000 tensor_qu

Skipping epoch 1 as pretrained model exists
Skipping epoch 2 as pretrained model exists
Skipping epoch 3 as pretrained model exists
Skipping epoch 4 as pretrained model exists
Skipping epoch 5 as pretrained model exists
Skipping epoch 6 as pretrained model exists
Skipping epoch 7 as pretrained model exists
Skipping epoch 8 as pretrained model exists
Skipping epoch 9 as pretrained model exists
Skipping epoch 10 as pretrained model exists
Skipping epoch 11 as pretrained model exists
Skipping epoch 12 as pretrained model exists
Skipping epoch 13 as pretrained model exists
Skipping epoch 14 as pretrained model exists
Skipping epoch 15 as pretrained model exists


### Rerun model evaluation after re-training

In [11]:
correct = 0
total = 0

model.eval()
start_time = timeit.default_timer()
with torch.no_grad():
    for iteraction, (images, labels) in tqdm(enumerate(data), total=len(data)):
        images, labels = images.to("cpu"), labels.to("cpu")
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(timeit.default_timer() - start_time)
print('Accuracy of the network on the 10000 test images: %.4f %%' % (
    100 * correct / total))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78/78 [15:45<00:00, 12.12s/it]

945.4669707930002
Accuracy of the network on the 10000 test images: 87.8205 %





**Accuracy after re-training**: 87.14% \
(runtime speed before and after fine-tuning is the same)

As expected the accuracy increases from 33.74% to 87.14%. This is due to two reasons. Firstly now the re-training happens with the Cifar10 dataset compared to the pre-trained model trained on ImageNet. While the pre-trained model will without approximation still have a decent performance as both datasets are similar re-training will increase the accuracy for this specific domain. \
The bigger impact comes from doing approximate aware re-training. The model adjusts the weights to facilitate an approximate multiplication in the process. As a neural network is inherently unprecise and deals with that problem internally already this works quite well and will increase the accuracy.