1. Static PTQ with MinMax calibration (per-tensor and per-channel, via fbgemm)
2. CNN, CNN with Early Exit and Pre-trained models (MobileNetV2, ResNet50, ResNeXt50, EfficientNet-B0)
3. Dataset: CIFAR-10, CIFAR-100, Tiny ImageNet
4. Target: CPU inference

### Libraries

In [1]:

import os
import io
import time
import copy
import shutil
import zipfile
import urllib.request
from tqdm import tqdm
from pathlib import Path
from tabulate import tabulate
import matplotlib.pyplot as plt
from typing import Optional, Literal, Dict, Any, List

import torch
import torch.nn as nn
import torch.optim as optim
import torch.ao.quantization
import torch.nn.functional as F
from torchvision import datasets, transforms, models
from torch.quantization import QuantStub, DeQuantStub, prepare_qat, convert

In [2]:
# Report the PyTorch build and the quantized engines this build supports.
print(f"PyTorch: {torch.__version__}")
print(torch.backends.quantized.supported_engines)

# Everything in this notebook runs on the CPU; `results` starts as an empty list.
device, results = torch.device("cpu"), []

PyTorch: 2.5.1+cpu
['none', 'onednn', 'x86', 'fbgemm']


### Dataloader

In [3]:
DatasetName = Literal["cifar10", "cifar100", "tiny_imagenet"]


class DataloaderManager:
    """Build train/validation ``DataLoader`` pairs for the supported datasets.

    The dataset is selected by name; its image size, crop padding, class
    count and normalisation statistics come from ``CONFIGS``.
    """

    # Per-dataset settings. "stats" is a (mean, std) pair of per-channel
    # values handed to ``transforms.Normalize``.
    CONFIGS = {
        "cifar10": {
            "size": 32, "padding": 4, "num_classes": 10,
            "stats": ((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))
        },
        "cifar100": {
            "size": 32, "padding": 4, "num_classes": 100,
            "stats": ((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761))
        },
        "tiny_imagenet": {
            "size": 64, "padding": 8, "num_classes": 200,
            "stats": ((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        }
    }

    def __init__(self, root_dir: str, dataset: DatasetName):
        """Store the data root and look up the configuration for ``dataset``.

        Raises:
            KeyError: if ``dataset`` is not a key of ``CONFIGS``.
        """
        self.root = Path(root_dir)
        self.dataset = dataset
        self.cfg = self.CONFIGS[dataset]

    def _get_transforms(self, train: bool):
        """Return the transform pipeline for the train or validation split.

        Training adds a random horizontal flip and a padded random crop;
        both splits end with ``ToTensor`` followed by ``Normalize``.
        """
        mean, std = self.cfg["stats"]
        tf_list = []
        if train:
            tf_list.extend([
                transforms.RandomHorizontalFlip(),
                transforms.RandomCrop(self.cfg["size"], padding=self.cfg["padding"])
            ])
        tf_list.extend([transforms.ToTensor(), transforms.Normalize(mean, std)])
        return transforms.Compose(tf_list)

    def _get_dataset_instance(self, train: bool):
        """Instantiate the torchvision dataset for the requested split.

        CIFAR datasets are downloaded into ``self.root`` when missing.
        Tiny ImageNet is read with ``ImageFolder`` from
        ``<root>/tiny-imagenet-200/{train,val}``.
        """
        tf = self._get_transforms(train)
        if self.dataset.startswith("cifar"):
            ds_cls = datasets.CIFAR10 if self.dataset == "cifar10" else datasets.CIFAR100
            return ds_cls(self.root, train=train, download=True, transform=tf)

        # NOTE(review): ImageFolder expects one sub-directory per class;
        # confirm that tiny-imagenet-200/val has been reorganised that way.
        path = self.root / "tiny-imagenet-200" / ("train" if train else "val")
        return datasets.ImageFolder(str(path), transform=tf)

    @staticmethod
    def _loader_kwargs(batch_size: int, num_workers: int) -> dict:
        """Return the keyword arguments shared by both ``DataLoader`` objects.

        Pinned memory only helps host-to-GPU copies, and requesting it on a
        build without an accelerator makes ``DataLoader`` emit a warning, so
        it is enabled only when CUDA is available.
        """
        return {
            "batch_size": batch_size,
            "num_workers": num_workers,
            "pin_memory": torch.cuda.is_available(),
        }

    def get_loaders(self, batch_size: int, num_workers: int):
        """Build, summarise and return ``(train_loader, val_loader)``.

        The training loader shuffles its samples; the validation loader
        keeps them in dataset order.
        """
        train_ds = self._get_dataset_instance(train=True)
        val_ds   = self._get_dataset_instance(train=False)

        loader_args = self._loader_kwargs(batch_size, num_workers)

        train_loader = torch.utils.data.DataLoader(train_ds, shuffle=True, **loader_args)
        val_loader   = torch.utils.data.DataLoader(val_ds, shuffle=False, **loader_args)

        self._print_summary(train_loader, val_loader, batch_size)
        return train_loader, val_loader

    def _print_summary(self, train_loader, val_loader, batch_size):
        """Print dataset name, sample counts, class count and batch counts."""
        print(f"Dataset      : {self.dataset}")
        print(f"Train samples: {len(train_loader.dataset)}")
        print(f"Val samples  : {len(val_loader.dataset)}")
        print(f"Classes      : {self.cfg['num_classes']}")
        print(f"Batch size   : {batch_size}")
        print(f"Train batches: {len(train_loader)}")
        print(f"Val batches  : {len(val_loader)}")

In [4]:
# Build the CIFAR-10 loaders once; the cells below reuse them.
manager = DataloaderManager(root_dir="../data", dataset="cifar10")
train_loader, val_loader = manager.get_loaders(batch_size=64, num_workers=4)
# Class count of the selected dataset, read from the manager's config.
num_classes = manager.cfg["num_classes"]

Files already downloaded and verified
Files already downloaded and verified
Dataset      : cifar10
Train samples: 50000
Val samples  : 10000
Classes      : 10
Batch size   : 64
Train batches: 782
Val batches  : 157


### Training and evaluation functions 

In [5]:
def train_model(
    model: nn.Module,
    train_loader,
    val_loader,
    device,
    epochs: int,
    lr: float,
    model_name: str = "model",
    patience: int = 5,
) -> nn.Module:
    """Train ``model`` with SGD and return it loaded with its best weights.

    Uses SGD (momentum 0.9, weight decay 1e-4), a cosine-annealing schedule
    spanning ``epochs`` and cross-entropy loss. After every epoch the model
    is validated; whenever validation accuracy improves, the weights are
    saved to ``best_{model_name}.pth`` in the working directory.

    Args:
        model: network to train; it is moved to ``device``.
        train_loader: iterable of ``(data, target)`` training batches.
        val_loader: iterable of ``(data, target)`` validation batches.
        device: device used for training and validation.
        epochs: maximum number of epochs.
        lr: initial learning rate.
        model_name: label used in logs and in the checkpoint file name.
        patience: number of consecutive epochs without improvement in
            validation accuracy after which training stops early.

    Returns:
        The model with the best checkpoint of this run loaded. If no
        checkpoint was written (e.g. ``epochs == 0``), the model is
        returned as is.
    """
    print(f"Training {model_name} | Epochs: {epochs} | LR: {lr}")
    model = model.to(device)

    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs)
    criterion = nn.CrossEntropyLoss()

    checkpoint_path = f"best_{model_name}.pth"
    # -inf guarantees the first epoch always writes a checkpoint, even when
    # its accuracy is exactly 0.
    best_acc = float("-inf")
    best_saved = False
    # The counter must live outside the epoch loop: resetting it every epoch
    # would prevent early stopping from ever triggering.
    trigger_times = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

        for batch_idx, (data, target) in enumerate(pbar, start=1):
            data, target = data.to(device), target.to(device)

            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            # Average over the batches seen so far, not over the whole epoch,
            # so the value shown mid-epoch is a true running mean.
            pbar.set_postfix(loss=f"{running_loss / batch_idx:.4f}")

        acc = validate_model(model, val_loader, device, desc=f"Eval Ep {epoch+1}")
        scheduler.step()

        print(f"Result Epoch {epoch+1}: Loss = {running_loss/len(train_loader):.4f} | Acc = {acc:.2f}%")

        if acc > best_acc:
            best_acc = acc
            torch.save(model.state_dict(), checkpoint_path)
            best_saved = True
            trigger_times = 0
        else:
            trigger_times += 1
            if trigger_times >= patience:
                print("Early stopping!")
                break

    # Only reload a checkpoint written by this run; a file with the same
    # name left over from an earlier run must not overwrite the weights.
    if best_saved:
        model.load_state_dict(torch.load(checkpoint_path, weights_only=True))
    return model

def validate_model(model: nn.Module, data_loader, device, desc="Validating") -> float:
    """Return the top-1 accuracy (in percent) of ``model`` over ``data_loader``.

    The model is switched to eval mode and gradients are disabled. Progress
    is shown with a tqdm bar labelled ``desc`` that is cleared when done.
    """
    model.eval()
    hits, seen = 0, 0

    with torch.no_grad():
        progress = tqdm(data_loader, desc=desc, leave=False)
        for inputs, labels in progress:
            inputs = inputs.to(device)
            labels = labels.to(device)
            predictions = torch.argmax(model(inputs), dim=1)
            hits += (predictions == labels).sum().item()
            seen += labels.size(0)

    return 100.0 * hits / seen

def evaluate_model(model: nn.Module, test_loader, device) -> float:
    """Return the top-1 accuracy (in percent) of ``model`` over ``test_loader``.

    The model is put in eval mode and evaluated without gradient tracking;
    each batch is moved to ``device`` before the forward pass.
    """
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            predictions = torch.argmax(model(inputs), dim=1)
            hits += (predictions == labels).sum().item()
            seen += labels.size(0)
    return 100.0 * hits / seen

def measure_inference_time(
    model: nn.Module,
    test_loader,
    device,
    num_batches: int = 20,
    warmup_batches: int = 3,
) -> float:
    """Return mean latency per batch in milliseconds.

    At most ``num_batches`` batches are consumed from ``test_loader``. The
    first ``warmup_batches`` of them are run but not timed; the remaining
    ones are timed individually and averaged.

    Args:
        model: module to time; it is switched to eval mode.
        test_loader: iterable of ``(data, target)`` batches.
        device: device the input batches are moved to.
        num_batches: total number of batches to consume, warm-up included.
        warmup_batches: number of leading batches excluded from the timing.

    Raises:
        ValueError: if no batch is left to time after the warm-up.
    """
    model.eval()
    durations = []
    with torch.no_grad():
        for i, (data, _) in enumerate(test_loader):
            if i >= num_batches:
                break
            data = data.to(device)
            if i < warmup_batches:
                model(data)
                continue
            # perf_counter is monotonic and has the highest available
            # resolution, unlike the wall clock returned by time.time().
            start = time.perf_counter()
            model(data)
            durations.append(time.perf_counter() - start)

    if not durations:
        raise ValueError(
            "No batch was timed: the loader must yield more than "
            f"{warmup_batches} batches and num_batches ({num_batches}) "
            "must exceed the warm-up count."
        )
    return sum(durations) / len(durations) * 1000

def get_model_size(model: nn.Module) -> float:
    """Return the size of the serialized ``state_dict`` in MB (bytes / 1024 / 1024).

    The state dict is saved into an in-memory buffer, so nothing is written
    to disk.
    """
    bytes_per_mb = 1024 * 1024
    with io.BytesIO() as buffer:
        torch.save(model.state_dict(), buffer)
        n_bytes = buffer.tell()
    return n_bytes / bytes_per_mb

# PTQ

## Calibration MinMax

### Experiment 1

#### CNN without observers

In [6]:
class CNN_NO_OBS(nn.Module):
    """Plain FP32 CNN: two Conv-ReLU-MaxPool stages and a linear classifier.

    The classifier takes 64 * 8 * 8 features, which matches 32x32 inputs
    after the two 2x2 poolings.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Each stage is Conv(3x3, padding 1) -> ReLU -> MaxPool(2); layers
        # are appended in order so the Sequential indices stay 0..5.
        stages = []
        for in_channels, out_channels in ((3, 32), (32, 64)):
            stages.append(nn.Conv2d(in_channels, out_channels, 3, padding=1))
            stages.append(nn.ReLU())
            stages.append(nn.MaxPool2d(2))
        self.conv = nn.Sequential(*stages)

        self.fc = nn.Linear(64 * 8 * 8, num_classes)

    def forward(self, x):
        features = self.conv(x)
        return self.fc(features.flatten(1))

#### Training

In [8]:
# Experiment 1: train the plain FP32 CNN for 5 epochs; train_model returns
# the model with the best checkpoint (best_experiment1_cnn_fp32.pth) loaded.
cnn_fp32 = CNN_NO_OBS(num_classes=10).to(device)
cnn_fp32 = train_model(
    cnn_fp32, train_loader, val_loader, device,
    epochs=5, lr=0.01, model_name="experiment1_cnn_fp32"
)

# FP32 baseline metrics: accuracy, mean latency per batch and serialized size.
acc_cnn_fp32  = evaluate_model(cnn_fp32, val_loader, device)
time_cnn_fp32 = measure_inference_time(cnn_fp32, val_loader, device)
size_cnn_fp32 = get_model_size(cnn_fp32)

print(f"Results:")
print(f"Acc     : {acc_cnn_fp32:.2f}%")
print(f"Latency : {time_cnn_fp32:.2f} ms/batch")
print(f"Size    : {size_cnn_fp32:.5f} MB")

Training experiment1_cnn_fp32 | Epochs: 5 | LR: 0.01


Epoch 1/5: 100%|██████████| 782/782 [00:23<00:00, 32.61it/s, loss=1.5351]
                                                            

Result Epoch 1: Loss = 1.5351 | Acc = 59.94%


Epoch 2/5: 100%|██████████| 782/782 [00:23<00:00, 33.77it/s, loss=1.1884]
                                                            

Result Epoch 2: Loss = 1.1884 | Acc = 63.85%


Epoch 3/5: 100%|██████████| 782/782 [00:23<00:00, 33.85it/s, loss=1.0591]
                                                            

Result Epoch 3: Loss = 1.0591 | Acc = 67.41%


Epoch 4/5: 100%|██████████| 782/782 [00:23<00:00, 33.83it/s, loss=0.9651]
                                                            

Result Epoch 4: Loss = 0.9651 | Acc = 70.57%


Epoch 5/5: 100%|██████████| 782/782 [00:23<00:00, 33.20it/s, loss=0.9092]
                                                            

Result Epoch 5: Loss = 0.9092 | Acc = 71.80%
Results:
Acc     : 71.80%
Latency : 5.66 ms/batch
Size    : 0.23249 MB


In [9]:
# Reload the best experiment-1 checkpoint and inspect the FP32 parameters.
state_dict = torch.load("best_experiment1_cnn_fp32.pth", map_location="cpu", weights_only=True)
cnn_fp32.load_state_dict(state_dict)
cnn_fp32.eval()

# First pass: parameter names and shapes only.
print("Weights Structure:")
for name, param in cnn_fp32.named_parameters():
    print(f"{name} -> {param.shape}")
    
print('='*42)

print("Weights Statistics:\n")

# Second pass: first 10 flattened values, dtype, shape, mean and std.
for name, param in cnn_fp32.named_parameters():
    print(f"{name}:")
    print(param.data.view(-1)[:10])
    print(f"Dtype: {param.dtype}")
    print(f"Shape: {tuple(param.shape)}")
    print(f"Mean: {param.data.mean().item():.6f}")
    print(f"Std:  {param.data.std().item():.6f}")
    print('-'*42)
  
# dtype of the first parameter, used as the dtype of the whole model.
print('Datatype model:')  
print(next(cnn_fp32.parameters()).dtype)

Weights Structure:
conv.0.weight -> torch.Size([32, 3, 3, 3])
conv.0.bias -> torch.Size([32])
conv.3.weight -> torch.Size([64, 32, 3, 3])
conv.3.bias -> torch.Size([64])
fc.weight -> torch.Size([10, 4096])
fc.bias -> torch.Size([10])
Weights Statistics:

conv.0.weight:
tensor([-0.0083, -0.2532,  0.1813, -0.3246,  0.0511, -0.0176,  0.1043,  0.1120,
         0.0295,  0.1621])
Dtype: torch.float32
Shape: (32, 3, 3, 3)
Mean: 0.000135
Std:  0.197591
------------------------------------------
conv.0.bias:
tensor([-0.0279, -0.4891, -0.1486, -0.4258, -0.4363, -0.2449, -0.4396, -0.1404,
        -0.0152, -0.1232])
Dtype: torch.float32
Shape: (32,)
Mean: -0.236521
Std:  0.191881
------------------------------------------
conv.3.weight:
tensor([-0.0315, -0.0453,  0.0132, -0.0238, -0.0111,  0.0778,  0.0407,  0.0863,
         0.1192,  0.0064])
Dtype: torch.float32
Shape: (64, 32, 3, 3)
Mean: -0.012429
Std:  0.056563
------------------------------------------
conv.3.bias:
tensor([ 0.0652,  0.0295, -0

#### CNN with observers

In [10]:
class CNN_OBS(nn.Module):
    """Experiment-1 CNN wrapped with quant/dequant stubs for eager-mode PTQ.

    Same layers as the plain CNN (two Conv-ReLU-MaxPool stages and a linear
    classifier); the input passes through ``QuantStub`` first and the
    logits through ``DeQuantStub`` last.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Stubs are registered before the layers to keep the module order
        # quant, dequant, conv, fc.
        self.quant = QuantStub()
        self.dequant = DeQuantStub()

        stages = []
        for in_channels, out_channels in ((3, 32), (32, 64)):
            stages.append(nn.Conv2d(in_channels, out_channels, 3, padding=1))
            stages.append(nn.ReLU())
            stages.append(nn.MaxPool2d(2))
        self.conv = nn.Sequential(*stages)
        self.fc = nn.Linear(64 * 8 * 8, num_classes)

    def forward(self, x):
        features = self.conv(self.quant(x))
        logits = self.fc(features.flatten(1))
        return self.dequant(logits)

#### Calibration

In [None]:
# Copy the trained FP32 weights into the stub-wrapped model.
cnn_quantized = CNN_OBS(num_classes=10).to(device)
cnn_quantized.load_state_dict(cnn_fp32.state_dict())
cnn_quantized.eval()

# prepare() attaches the observers named by the qconfig. No data has gone
# through them yet, so calibration only happens on the next forward passes.
cnn_quantized.qconfig = torch.ao.quantization.default_qconfig
cnn_quantized = torch.ao.quantization.prepare(cnn_quantized)
cnn_quantized

CNN_OBS(
  (quant): QuantStub(
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (dequant): DeQuantStub()
  (conv): Sequential(
    (0): Conv2d(
      3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
      (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
    )
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(
      32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
      (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
    )
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Linear(
    in_features=4096, out_features=10, bias=True
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
)

In [None]:
# The model is prepared but not yet converted, hence the "FP32" labels.
# These forward passes over val_loader are what feed the attached observers.
acc_quantized = validate_model(cnn_quantized, val_loader, device, desc='Val Quantized')
lat_quantized = measure_inference_time(cnn_quantized, val_loader, device)
model_size_quantized = get_model_size(cnn_quantized)

print(f"FP32 Accuracy: {acc_quantized:.2f}%")
print(f"FP32 Latency:  {lat_quantized:.2f} ms/batch")
print(f"FP32 Size:  {model_size_quantized:.5f} MB")

                                                                

FP32 Accuracy: 71.80%
FP32 Latency:  6.13 ms/batch
FP32 Size:  0.24 MB


#### Conversion

In [14]:
# Replace the observed modules with their quantized counterparts; the
# trailing expression displays the converted model in the notebook.
cnn_quantized
cnn_quantized = torch.ao.quantization.convert(cnn_quantized)
cnn_quantized

CNN_OBS(
  (quant): Quantize(scale=tensor([0.0324]), zero_point=tensor([61]), dtype=torch.quint8)
  (dequant): DeQuantize()
  (conv): Sequential(
    (0): QuantizedConv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), scale=0.18029893934726715, zero_point=66, padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): QuantizedConv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.20834548771381378, zero_point=87, padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): QuantizedLinear(in_features=4096, out_features=10, scale=0.25371870398521423, zero_point=57, qscheme=torch.per_tensor_affine)
)

In [17]:
# Show the raw integer representation of the first conv layer's quantized weights.
print("Weights Statistics:\n")
print(torch.int_repr(cnn_quantized.conv[0].weight()))

Weights Statistics:

tensor([[[[  -1,  -43,   31],
          [ -55,    9,   -3],
          [  18,   19,    5]],

         [[  28,   12,   25],
          [ -13,   26,   43],
          [ -13,   31,   19]],

         [[ -29,  -19,    8],
          [ -20,  -50,  -25],
          [ -38,    7,   12]]],


        [[[  49,   43,   15],
          [  -1,  -31,  -15],
          [   0,  -48,  -33]],

         [[  32,   12,  -14],
          [   3,  -39,  -26],
          [  25,   15,    0]],

         [[  51,  -14,   31],
          [  20,  -32,    8],
          [ -20,  -12,    6]]],


        [[[  -7,  -20,  -13],
          [  -9,  -21,  -30],
          [ -48,  -36,  -29]],

         [[ -27,    6,   -3],
          [   3,  -23,    4],
          [ -28,  -18,   19]],

         [[  20,   35,   28],
          [  37,   67,   32],
          [  34,   44,   37]]],


        [[[ -66,  -25,  -60],
          [ -31,   21,  -22],
          [  47,   46,   34]],

         [[ -33,   16,  -13],
          [ -33,   34, 

#### Int8 Inference

In [18]:
# Same three metrics as the FP32 baseline, now on the converted INT8 model.
acc_int8 = validate_model(cnn_quantized, val_loader, device, desc='Final INT8 Eval')
lat_int8 = measure_inference_time(cnn_quantized, val_loader, device)
model_size_int8 = get_model_size(cnn_quantized)

print(f"Acc INT8     : {acc_int8:.2f}%")
print(f"Latency INT8 : {lat_int8:.2f} ms/batch")
print(f"Size INT8    : {model_size_int8:.2f} MB")

                                                                  

Acc INT8     : 71.33%
Latency INT8 : 2.80 ms/batch
Size INT8    : 0.06 MB


### Resumo

Acho que até aqui fica claro o que está sendo feito. PTQ Estática com calibração MinMax. No experimento 2 quero explorar uma CNN mais robusta, mostrar o efeito do Fold BatchNormalization, o efeito da granularidade por tensor e por canal e o motivo da dificuldade relacionada aos modelos pré-treinados

#### Sobre a fusão de camadas

Durante a inferência, operações como **Conv2d → BatchNorm2d → ReLU** são executadas sequencialmente: cada operação lê da memória, computa e escreve de volta. A ideia de realizar uma *fusão de camadas* combina essas operações em um único kernel, eliminando as escritas/leituras intermediárias. Os efeitos principais são:

- Latência menor — menos viagens à memória cache/RAM.  
- Sem perda de acurácia — matematicamente equivalente ao pipeline original (em eval mode).  

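Ao fundir Conv+BN antes de quantizar, o quantizador passa a observar os pesos efetivamente usados na inferência (já escalados por $\gamma / \sqrt{\sigma^2 + \varepsilon}$), e a BN deixa de existir como uma operação separada, com sua própria etapa de quantização. Como esse fator de escala varia de canal para canal, a faixa de valores dos pesos fundidos pode diferir bastante entre canais — é justamente nesse cenário que a quantização por canal tende a preservar mais resolução INT8 do que a quantização por tensor.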

Importante lembrar que a fusão só é possível em `model.eval()`. Em modo treino o BN ainda atualiza running stats, então a fusão seria incorreta.

#### Fold BN (Batch Normalization Folding)

Fluxo típico:

1. Treinamento normal com a sequência Conv → BatchNorm → ReLU.
2. O modelo entra em modo eval (a BN passa a usar média e variância fixas).
3. Fold da BatchNorm dentro da convolução:
   - A BN é incorporada nos pesos e no bias da Conv.
   - A camada BN é removida do grafo.
4. Quantização (PTQ ou QAT):
   - O modelo agora tem a forma Conv(folded) → ReLU.
   - Pesos e ativações são quantizados.

A BatchNorm em modo inferência é só uma transformação linear, para cada canal de saída $c$:

$$
y_c = \gamma_c \cdot \frac{x_c - \mu_c}{\sqrt{\sigma_c^2 + \varepsilon}} + \beta_c
$$

onde:

- $\mu_c = \text{running\_mean}_c$
- $\sigma_c^2 = \text{running\_var}_c$

Convolução antes do folding

$$
x_c = W_c * a + b_c
$$

onde:

- $W_c$ = pesos do canal de saída $c$  
- $b_c$ = bias do canal $c$  
- $a$ = ativação de entrada  

Substituindo Conv dentro da BN

$$
y_c =
\gamma_c
\frac{(W_c * a + b_c) - \mu_c}{\sqrt{\sigma_c^2 + \varepsilon}}
+ \beta_c
$$

Fatorando:

$$
y_c =
\left(
\frac{\gamma_c}{\sqrt{\sigma_c^2 + \varepsilon}}
\right)
W_c * a
+
\left(
\frac{\gamma_c (b_c - \mu_c)}{\sqrt{\sigma_c^2 + \varepsilon}}
+ \beta_c
\right)
$$

Equações do folding

Definindo:

$$
\alpha_c = \frac{\gamma_c}{\sqrt{\sigma_c^2 + \varepsilon}}
$$

#### Pesos fundidos

$$
W'_c = \alpha_c \, W_c
$$

#### Bias fundido

$$
b'_c =
\alpha_c (b_c - \mu_c)
+ \beta_c
$$

Forma vetorial (implementação prática)

$$
W' = W \cdot \frac{\gamma}{\sqrt{\text{running\_var} + \varepsilon}}
$$

$$
b' =
\left(
b - \text{running\_mean}
\right)
\cdot
\frac{\gamma}{\sqrt{\text{running\_var} + \varepsilon}}
+ \beta
$$

A multiplicação é por canal de saída (broadcasting).

Caso a Conv não tenha bias, considere $b = 0$:

$$
b' =
-\mu \cdot \frac{\gamma}{\sqrt{\sigma^2 + \varepsilon}}
+ \beta
$$

Após o folding, a camada BatchNorm desaparece e resta apenas uma Conv com pesos $W'$ e bias $b'$, pronta para quantização.

### Experiment 2

#### CNN No Observers

In [19]:
class CNN_NO_OBS(nn.Module):
    """FP32 CNN with three Conv-BatchNorm-ReLU stages and a linear classifier.

    The first two stages are followed by a 2x2 max-pool. An adaptive average
    pool brings the feature map to 8x8 before the classifier.
    """

    def __init__(self, num_classes: int):
        super().__init__()
        # Stage 1: 3 -> 32 channels.
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu1 = nn.ReLU(inplace=False)

        # Stage 2: 32 -> 64 channels.
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.relu2 = nn.ReLU(inplace=False)

        # Stage 3: 64 -> 128 channels.
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.relu3 = nn.ReLU(inplace=False)

        self.pool = nn.AdaptiveAvgPool2d((8, 8))
        self.fc = nn.Linear(128 * 8 * 8, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = F.max_pool2d(self.relu1(self.bn1(self.conv1(x))), 2)
        x = F.max_pool2d(self.relu2(self.bn2(self.conv2(x))), 2)
        x = self.relu3(self.bn3(self.conv3(x)))

        pooled = self.pool(x)
        return self.fc(torch.flatten(pooled, 1))

#### Training

In [21]:
# Experiment 2: train the BatchNorm CNN for 5 epochs; train_model returns
# the model with the best checkpoint (best_experiment2_cnn_fp32.pth) loaded.
cnn_fp32 = CNN_NO_OBS(num_classes=10).to(device)
cnn_fp32 = train_model(
    cnn_fp32, train_loader, val_loader, device,
    epochs=5, lr=0.01, model_name="experiment2_cnn_fp32"
)

# FP32 baseline metrics: accuracy, mean latency per batch and serialized size.
acc_cnn_fp32  = evaluate_model(cnn_fp32, val_loader, device)
time_cnn_fp32 = measure_inference_time(cnn_fp32, val_loader, device)
size_cnn_fp32 = get_model_size(cnn_fp32)

print(f"Results:")
print(f"Acc     : {acc_cnn_fp32:.2f}%")
print(f"Latency : {time_cnn_fp32:.2f} ms/batch")
print(f"Size    : {size_cnn_fp32:.5f} MB")

Training experiment2_cnn_fp32 | Epochs: 5 | LR: 0.01


Epoch 1/5: 100%|██████████| 782/782 [00:30<00:00, 25.30it/s, loss=1.5273]
                                                            

Result Epoch 1: Loss = 1.5273 | Acc = 64.03%


Epoch 2/5: 100%|██████████| 782/782 [00:31<00:00, 24.71it/s, loss=1.0502]
                                                            

Result Epoch 2: Loss = 1.0502 | Acc = 60.80%


Epoch 3/5: 100%|██████████| 782/782 [00:31<00:00, 24.87it/s, loss=0.8913]
                                                            

Result Epoch 3: Loss = 0.8913 | Acc = 71.85%


Epoch 4/5: 100%|██████████| 782/782 [00:31<00:00, 24.69it/s, loss=0.7889]
                                                            

Result Epoch 4: Loss = 0.7889 | Acc = 75.31%


Epoch 5/5: 100%|██████████| 782/782 [00:31<00:00, 25.18it/s, loss=0.7156]
                                                            

Result Epoch 5: Loss = 0.7156 | Acc = 76.63%
Results:
Acc     : 76.63%
Latency : 8.63 ms/batch
Size    : 0.67806 MB


In [22]:
# Reload the best experiment-2 checkpoint and inspect the FP32 parameters.
state_dict = torch.load("best_experiment2_cnn_fp32.pth", map_location="cpu", weights_only=True)
cnn_fp32.load_state_dict(state_dict)
cnn_fp32.eval()

# First pass: parameter names and shapes only.
print("Weights Structure:")
for name, param in cnn_fp32.named_parameters():
    print(f"{name} -> {param.shape}")
    
print('='*42)

print("Weights Statistics:\n")

# Second pass: first 10 flattened values, dtype, shape, mean and std.
for name, param in cnn_fp32.named_parameters():
    print(f"{name}:")
    print(param.data.view(-1)[:10])
    print(f"Dtype: {param.dtype}")
    print(f"Shape: {tuple(param.shape)}")
    print(f"Mean: {param.data.mean().item():.6f}")
    print(f"Std:  {param.data.std().item():.6f}")
    print('-'*42)
  
# dtype of the first parameter, used as the dtype of the whole model.
print('Datatype model:')  
print(next(cnn_fp32.parameters()).dtype)

Weights Structure:
conv1.weight -> torch.Size([32, 3, 3, 3])
conv1.bias -> torch.Size([32])
bn1.weight -> torch.Size([32])
bn1.bias -> torch.Size([32])
conv2.weight -> torch.Size([64, 32, 3, 3])
conv2.bias -> torch.Size([64])
bn2.weight -> torch.Size([64])
bn2.bias -> torch.Size([64])
conv3.weight -> torch.Size([128, 64, 3, 3])
conv3.bias -> torch.Size([128])
bn3.weight -> torch.Size([128])
bn3.bias -> torch.Size([128])
fc.weight -> torch.Size([10, 8192])
fc.bias -> torch.Size([10])
Weights Statistics:

conv1.weight:
tensor([ 0.0483,  0.0565,  0.1348, -0.1159, -0.1469, -0.1909,  0.1750, -0.2315,
        -0.1932,  0.1487])
Dtype: torch.float32
Shape: (32, 3, 3, 3)
Mean: 0.000287
Std:  0.192709
------------------------------------------
conv1.bias:
tensor([ 0.1551,  0.0933, -0.1757,  0.1327,  0.1873,  0.0394,  0.0653,  0.0367,
        -0.0382, -0.1751])
Dtype: torch.float32
Shape: (32,)
Mean: -0.006533
Std:  0.101757
------------------------------------------
bn1.weight:
tensor([0.9763, 

#### CNN Observers

In [23]:
class CNN_OBS(nn.Module):
    """Experiment-2 CNN wrapped with quant/dequant stubs for eager-mode PTQ.

    Three Conv-BatchNorm-ReLU stages (the first two followed by a 2x2
    max-pool), an adaptive 8x8 average pool and a linear classifier. The
    input goes through ``QuantStub`` and the logits through ``DeQuantStub``.
    """

    # Module-name groups to hand to ``fuse_modules``: one Conv-BN-ReLU triple
    # per stage.
    CNN_FUSION_PATTERNS = [
        ["conv1", "bn1", "relu1"],
        ["conv2", "bn2", "relu2"],
        ["conv3", "bn3", "relu3"]
    ]

    def __init__(self, num_classes: int):
        super().__init__()
        self.quant = QuantStub()
        self.dequant = DeQuantStub()

        # Stage 1: 3 -> 32 channels.
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu1 = nn.ReLU(inplace=False)

        # Stage 2: 32 -> 64 channels.
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.relu2 = nn.ReLU(inplace=False)

        # Stage 3: 64 -> 128 channels.
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.relu3 = nn.ReLU(inplace=False)

        self.pool = nn.AdaptiveAvgPool2d((8, 8))
        self.fc = nn.Linear(128 * 8 * 8, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.quant(x)

        x = F.max_pool2d(self.relu1(self.bn1(self.conv1(x))), 2)
        x = F.max_pool2d(self.relu2(self.bn2(self.conv2(x))), 2)
        x = self.relu3(self.bn3(self.conv3(x)))

        logits = self.fc(torch.flatten(self.pool(x), 1))
        return self.dequant(logits)

#### Folding BatchNormalization

In [24]:
def _find_conv_bn_relu_sequences(module: nn.Module) -> List[List[str]]:
    """List the Conv2d-BatchNorm2d(-ReLU) groups among ``module``'s direct children.

    Children are scanned in registration order. A ``Conv2d`` immediately
    followed by a ``BatchNorm2d`` yields a two-name group, extended to three
    names when a ``ReLU``/``ReLU6`` comes right after. Groups never overlap.

    NOTE(review): adjacency here is registration order, which is not
    necessarily the order in which ``forward`` calls the modules.
    """
    siblings = list(module.named_children())
    total = len(siblings)
    groups: List[List[str]] = []
    pos = 0
    while pos < total:
        name, layer = siblings[pos]
        follower = siblings[pos + 1] if pos + 1 < total else None
        starts_group = (
            isinstance(layer, nn.Conv2d)
            and follower is not None
            and isinstance(follower[1], nn.BatchNorm2d)
        )
        if not starts_group:
            pos += 1
            continue

        group = [name, follower[0]]
        trailing = siblings[pos + 2] if pos + 2 < total else None
        if trailing is not None and isinstance(trailing[1], (nn.ReLU, nn.ReLU6)):
            group.append(trailing[0])
        groups.append(group)
        # Skip past every module consumed by this group.
        pos += len(group)
    return groups

def fuse_bn_recursively(model: nn.Module) -> nn.Module:
    """Fuse Conv-BN(-ReLU) groups in ``model`` and all its submodules, in place.

    Submodules are processed before the module that contains them. The
    module must be in eval mode. Returns the same ``model`` object.
    """
    assert not model.training, "Call model.eval() before BN folding"

    # Depth-first: handle the children before this module's own groups.
    for submodule in model.children():
        fuse_bn_recursively(submodule)

    groups = _find_conv_bn_relu_sequences(model)
    if groups:
        torch.quantization.fuse_modules(model, groups, inplace=True)
    return model

In [25]:
def _minmax_qconfig(mode: str, backend: str):
    """Build a MinMax-calibrated QConfig for the given weight granularity."""
    # fbgemm/x86 kernels can saturate with full 8-bit activations, which is
    # why PyTorch's own defaults for those engines use the reduced 0..127
    # range; the same range is applied here for both modes.
    range_kwargs = {"quant_min": 0, "quant_max": 127} if backend in ("fbgemm", "x86") else {}
    activation = torch.ao.quantization.MinMaxObserver.with_args(
        dtype=torch.quint8, qscheme=torch.per_tensor_affine, **range_kwargs
    )
    if mode == "per_channel":
        weight = torch.ao.quantization.PerChannelMinMaxObserver.with_args(
            dtype=torch.qint8, qscheme=torch.per_channel_symmetric
        )
    else:
        weight = torch.ao.quantization.MinMaxObserver.with_args(
            dtype=torch.qint8, qscheme=torch.per_tensor_symmetric
        )
    return torch.ao.quantization.QConfig(activation=activation, weight=weight)


def apply_ptq(model: nn.Module, calibration_loader, mode: Literal["per_tensor", "per_channel"] = "per_channel",
            fold_bn: bool = False, num_calibration_batches: int = 10, custom_fusion_patterns: Optional[List[List[str]]] = None,
            backend: str = "fbgemm") -> nn.Module:
    """Return a statically quantized (INT8) copy of ``model``.

    The input model is deep-copied, moved to the CPU and put in eval mode;
    the original is left untouched. Both modes calibrate activations with a
    per-tensor ``MinMaxObserver`` and differ only in the weight observer, so
    their results are directly comparable.

    Args:
        model: FP32 model that already contains its Quant/DeQuant stubs.
        calibration_loader: iterable of ``(data, target)`` batches; only
            ``data`` is used.
        mode: ``"per_tensor"`` or ``"per_channel"`` weight quantization.
        fold_bn: fuse Conv-BN(-ReLU) groups before quantizing.
        num_calibration_batches: maximum number of batches used to calibrate.
        custom_fusion_patterns: module-name groups to fuse when ``fold_bn``
            is set; when omitted, groups are discovered automatically.
        backend: quantized engine to select (set process-wide).

    Raises:
        ValueError: if ``mode`` is unknown or no calibration batch was seen.
    """
    from itertools import islice

    if mode not in ("per_tensor", "per_channel"):
        raise ValueError(f"Unknown quantization mode: {mode!r}")

    torch.backends.quantized.engine = backend

    model = copy.deepcopy(model).cpu().eval()

    if fold_bn:
        if custom_fusion_patterns is not None:
            torch.ao.quantization.fuse_modules(
                model, custom_fusion_patterns, inplace=True
            )
        else:
            fuse_bn_recursively(model)

    model.qconfig = _minmax_qconfig(mode, backend)
    model_prepared = torch.ao.quantization.prepare(model, inplace=False)

    # islice stops after the requested number of batches without fetching
    # an extra one from the loader.
    calibrated = 0
    with torch.no_grad():
        for data, _ in islice(calibration_loader, num_calibration_batches):
            model_prepared(data.cpu())
            calibrated += 1

    # Converting observers that never saw data would yield meaningless
    # scales, so fail loudly instead.
    if calibrated == 0:
        raise ValueError("Calibration needs at least one batch from calibration_loader")

    return torch.ao.quantization.convert(model_prepared, inplace=False)

In [28]:
# Fuse Conv-BN-ReLU on a copy of the FP32 model (no quantization yet) to
# isolate the effect of layer fusion on accuracy, latency and size.
cnn_fused = copy.deepcopy(cnn_fp32).cpu().eval()
torch.quantization.fuse_modules(cnn_fused, CNN_OBS.CNN_FUSION_PATTERNS, inplace=True)

lat_fused  = measure_inference_time(cnn_fused, val_loader, device)
acc_fused  = evaluate_model(cnn_fused, val_loader, device)
size_fused = get_model_size(cnn_fused)

# Side-by-side table: unfused baseline (measured earlier) vs. fused model.
print(f"{'Model':<30} {'Acc':>7}  {'Latency':>10}  {'Size':>8}")
print("="*60)
print(f"{'CNN FP32 without layer fusion':<30} {acc_cnn_fp32:>6.2f}%  {time_cnn_fp32:>8.2f}ms  {size_cnn_fp32:>6.3f}MB")
print(f"{'CNN FP32 with layer fusion':<30} {acc_fused:>6.2f}%  {lat_fused:>8.2f}ms  {size_fused:>6.3f}MB")

Model                              Acc     Latency      Size
CNN FP32 without layer fusion   76.63%      8.63ms   0.678MB
CNN FP32 with layer fusion      76.63%      7.86ms   0.671MB


In [None]:
torch.backends.quantized.engine = "fbgemm"

# Stub-wrapped copy of the experiment-2 network, loaded with the checkpoint
# that train_model wrote for model_name="experiment2_cnn_fp32". The stubs
# hold no parameters, so the plain CNN's state dict loads as is.
cnn_quant_base = CNN_OBS(num_classes=10)
cnn_quant_base.load_state_dict(
    torch.load("best_experiment2_cnn_fp32.pth", map_location="cpu", weights_only=True)
)

# NOTE(review): calibration uses val_loader, the same data the quantized
# models are later evaluated on; consider a held-out calibration split.
# The fusion patterns are only applied by apply_ptq when fold_bn=True.
print("Quantizando: per_tensor ...")
cnn_pt = apply_ptq(cnn_quant_base, val_loader, mode="per_tensor",  fold_bn=False, custom_fusion_patterns=CNN_OBS.CNN_FUSION_PATTERNS)

print("Quantizando: per_tensor + fold_bn ...")
cnn_pt_fold = apply_ptq(cnn_quant_base, val_loader, mode="per_tensor",  fold_bn=True, custom_fusion_patterns=CNN_OBS.CNN_FUSION_PATTERNS)

print("Quantizando: per_channel ...")
cnn_pc = apply_ptq(cnn_quant_base, val_loader, mode="per_channel", fold_bn=False, custom_fusion_patterns=CNN_OBS.CNN_FUSION_PATTERNS)

print("Quantizando: per_channel + fold_bn ...")
cnn_pc_fold = apply_ptq(cnn_quant_base, val_loader, mode="per_channel", fold_bn=True, custom_fusion_patterns=CNN_OBS.CNN_FUSION_PATTERNS)

In [None]:
def eval_all(model, name):
    """Measure ``model`` on the global ``val_loader`` and print one summary line.

    Returns the ``(accuracy, latency, size)`` tuple that was printed.
    """
    metrics = (
        evaluate_model(model, val_loader, device),
        measure_inference_time(model, val_loader, device),
        get_model_size(model),
    )
    acc, lat, size = metrics
    print(f"{name:<35} acc={acc:.2f}%  lat={lat:.2f}ms  size={size:.3f}MB")
    return metrics

# Header row, then one measured line per model: the FP32 baseline and the
# four PTQ variants. The metrics are kept for the summary tables.
print(f"{'Modelo':<35} {'Acc':>8}  {'Latência':>10}  {'Tamanho':>9}")
print("-" * 70)
acc_fp32_cnn,  lat_fp32_cnn,  size_fp32_cnn  = eval_all(cnn_fp32,      "CNN FP32")
acc_pt,        lat_pt,        size_pt         = eval_all(cnn_pt,       "CNN per_tensor")
acc_pt_fold,   lat_pt_fold,   size_pt_fold    = eval_all(cnn_pt_fold,  "CNN per_tensor + fold_bn")
acc_pc,        lat_pc,        size_pc         = eval_all(cnn_pc,       "CNN per_channel")
acc_pc_fold,   lat_pc_fold,   size_pc_fold    = eval_all(cnn_pc_fold,  "CNN per_channel + fold_bn")

In [None]:
# `T` is not among the notebook's top-of-file imports, so it is imported
# here; without it this cell fails with a NameError.
import torchvision.transforms as T

# ImageNet-style preprocessing: resize to 256, centre-crop to 224, then
# normalise with the ImageNet channel statistics.
imagenet_transform = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std =[0.229, 0.224, 0.225]),
])

# Rebuild the validation loader with the ImageNet transforms so the
# pre-trained models can be evaluated.
# NOTE(review): the images and labels are still CIFAR-10; confirm that the
# models evaluated on this loader predict CIFAR-10 classes, otherwise the
# reported accuracy is not meaningful.
import torchvision.datasets as dsets
cifar10_imagenet_val = dsets.CIFAR10(
    "../data", train=False, download=True, transform=imagenet_transform
)
imagenet_val_loader = torch.utils.data.DataLoader(
    cifar10_imagenet_val, batch_size=64, shuffle=False, num_workers=4, pin_memory=True
)

def evaluate_pretrained_variants(model_name: str):
    """Load, quantize and evaluate one pre-trained model under the four PTQ variants.

    Returns a dict with the model name plus accuracy, latency and size for
    the FP32 baseline and for each variant, keyed ``<tag>_acc``,
    ``<tag>_lat`` and ``<tag>_size``.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"  Modelo: {model_name}")
    print(f"{banner}")

    torch.backends.quantized.engine = "fbgemm"

    # FP32 baseline, built without the quantization wrapper.
    model_fp32 = prepare_pretrained_model(model_name, num_classes=1000, wrap=False)
    model_fp32.eval().cpu()

    acc_fp32_ = evaluate_model(model_fp32, imagenet_val_loader, device)
    lat_fp32_ = measure_inference_time(model_fp32, imagenet_val_loader, device, num_batches=10)
    sz_fp32_ = get_model_size(model_fp32)
    print(f"  FP32       acc={acc_fp32_:.2f}%  lat={lat_fp32_:.1f}ms  size={sz_fp32_:.1f}MB")

    # Quantization needs the wrapped model.
    # NOTE(review): presumably wrap=True adds the Quant/DeQuant stubs —
    # confirm in prepare_pretrained_model.
    base_wrapped = prepare_pretrained_model(model_name, num_classes=1000, wrap=True)

    # Keyword arguments forwarded to apply_ptq for each variant.
    variants = {
        "per_tensor": {"mode": "per_tensor", "fold_bn": False},
        "per_tensor_foldbn": {"mode": "per_tensor", "fold_bn": True},
        "per_channel": {"mode": "per_channel", "fold_bn": False},
        "per_channel_foldbn": {"mode": "per_channel", "fold_bn": True},
    }

    row = {
        "model": model_name,
        "fp32_acc": acc_fp32_,
        "fp32_lat": lat_fp32_,
        "fp32_size": sz_fp32_,
    }

    for tag, ptq_kwargs in variants.items():
        q_model = apply_ptq(
            base_wrapped, imagenet_val_loader, num_calibration_batches=10, **ptq_kwargs
        )
        acc_ = evaluate_model(q_model, imagenet_val_loader, device)
        lat_ = measure_inference_time(q_model, imagenet_val_loader, device, num_batches=10)
        sz_ = get_model_size(q_model)
        print(f"  {tag:<22} acc={acc_:.2f}%  lat={lat_:.1f}ms  size={sz_:.1f}MB")
        row.update({f"{tag}_acc": acc_, f"{tag}_lat": lat_, f"{tag}_size": sz_})

    return row

# ── Executar para todos os modelos pré-treinados ──────────────────────────────
pretrained_names = ["mobilenet_v2", "resnet50", "resnext50_32x4d", "efficientnet_b0"]
pretrained_results = []

for name in pretrained_names:
    row = evaluate_pretrained_variants(name)
    pretrained_results.append(row)

print("\nQuantização de modelos pré-treinados concluída!")


In [None]:
from tabulate import tabulate

# (display label, key prefix inside each pre-trained result row), in the order
# the variants are listed in both tables.
variant_keys = [
    ("FP32",                  "fp32"),
    ("per_tensor",            "per_tensor"),
    ("per_tensor + fold_bn",  "per_tensor_foldbn"),
    ("per_channel",           "per_channel"),
    ("per_channel + fold_bn", "per_channel_foldbn"),
]

# ── Table 1: Model | Variant | Latency (ms) | Size (MB) ───────────────────────
# Custom CNN: the metrics are read from variables produced by earlier cells.
cnn_variants_t1 = [
    ("CNN",  "FP32",                   lat_fp32_cnn,  size_fp32_cnn),
    ("CNN",  "per_tensor",             lat_pt,        size_pt),
    ("CNN",  "per_tensor + fold_bn",   lat_pt_fold,   size_pt_fold),
    ("CNN",  "per_channel",            lat_pc,        size_pc),
    ("CNN",  "per_channel + fold_bn",  lat_pc_fold,   size_pc_fold),
]
rows_t1 = list(cnn_variants_t1)

# Pre-trained models: one (model, variant, latency, size) tuple per variant.
for r in pretrained_results:
    m = r["model"]
    for label, key in variant_keys:
        rows_t1.append((m, label, r[key + "_lat"], r[key + "_size"]))

banner_t1 = "=" * 65
print("\n" + banner_t1)
print("TABELA 1 — Latência e Tamanho dos Modelos")
print(banner_t1)

# Latency is shown with 2 decimals, size with 3.
formatted_t1 = [
    [model, variant, f"{lat:.2f}", f"{sz:.3f}"]
    for model, variant, lat, sz in rows_t1
]
print(tabulate(
    formatted_t1,
    headers=["Modelo", "Variante", "Latência (ms/batch)", "Tamanho (MB)"],
    tablefmt="fancy_grid"
))

# ── Table 2: Model | FP32 | per_tensor | per_tensor_fold | per_channel | per_channel_fold ──
# Custom CNN accuracies, listed in the same variant order as `variant_keys`.
cnn_accuracies = (acc_fp32_cnn, acc_pt, acc_pt_fold, acc_pc, acc_pc_fold)
rows_t2 = [["CNN", *(f"{acc:.2f}" for acc in cnn_accuracies)]]

# Pre-trained models: one row per model, one accuracy column per variant.
for r in pretrained_results:
    rows_t2.append(
        [r["model"], *(f"{r[key + '_acc']:.2f}" for _, key in variant_keys)]
    )

banner_t2 = "=" * 90
print("\n" + banner_t2)
print("TABELA 2 — Acurácia (%) por Estratégia de Quantização")
print(banner_t2)
headers_t2 = [
    "Modelo", "FP32 (%)", "per_tensor (%)",
    "per_tensor\n+fold_bn (%)", "per_channel (%)",
    "per_channel\n+fold_bn (%)"
]
print(tabulate(rows_t2, headers=headers_t2, tablefmt="fancy_grid"))

# Legend explaining the variant names used in both tables.
print("\n[Legenda]")
print("  fold_bn   = BatchNorm absorvido nos pesos Conv antes da quantização")
print("  per_tensor = uma escala por tensor inteiro (pesos + ativações)")
print("  per_channel= uma escala por canal de saída (pesos) — padrão fbgemm")


#### Wrapper

In [None]:
class QuantizableWrapper(nn.Module):
    """Sandwich a model between a ``QuantStub`` and a ``DeQuantStub``.

    The input first passes through ``quant``, then through the wrapped
    ``model``, and the result finally passes through ``dequant``.

    Args:
        model: The network to wrap; it is stored unchanged as ``self.model``.
    """

    def __init__(self, model: nn.Module):
        super().__init__()
        # Registration order (quant -> model -> dequant) mirrors the data flow.
        self.quant = QuantStub()
        self.model = model
        self.dequant = DeQuantStub()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``dequant(model(quant(x)))``."""
        return self.dequant(self.model(self.quant(x)))


def prepare_pretrained_model(model_name: str, num_classes: int = 1000, wrap: bool = True) -> nn.Module:
    """Build a torchvision model with pre-trained weights, optionally wrapped.

    Args:
        model_name: One of the keys of the internal ``weights_map``
            (``"mobilenet_v2"``, ``"mobilenet_v3_small"``, ``"efficientnet_b0"``,
            ``"resnet50"``, ``"resnext50_32x4d"``).
        num_classes: Number of output classes. When it differs from 1000 the
            final ``nn.Linear`` layer is replaced by a freshly initialised one
            with ``num_classes`` outputs.
        wrap: If true, return the model inside a ``QuantizableWrapper``;
            otherwise return the bare model.

    Returns:
        The constructed (and possibly wrapped) model.

    Raises:
        ValueError: If ``model_name`` is not a supported architecture.
    """
    # model name -> (constructor, pre-trained weights enum member)
    weights_map = {
        "mobilenet_v2":      (models.mobilenet_v2,       models.MobileNet_V2_Weights.IMAGENET1K_V2),
        "mobilenet_v3_small":(models.mobilenet_v3_small, models.MobileNet_V3_Small_Weights.IMAGENET1K_V1),
        "efficientnet_b0":   (models.efficientnet_b0,    models.EfficientNet_B0_Weights.IMAGENET1K_V1),
        "resnet50":          (models.resnet50,           models.ResNet50_Weights.IMAGENET1K_V2),
        "resnext50_32x4d":   (models.resnext50_32x4d,   models.ResNeXt50_32X4D_Weights.IMAGENET1K_V2),
    }

    entry = weights_map.get(model_name)
    if entry is None:
        raise ValueError(f"Model '{model_name}' not supported. Choose from: {list(weights_map)}")

    build, pretrained = entry
    net = build(weights=pretrained)

    if num_classes != 1000:
        if model_name in ("resnet50", "resnext50_32x4d"):
            # ResNet-style models expose their final layer as `fc`.
            net.fc = nn.Linear(net.fc.in_features, num_classes)
        else:
            # The remaining supported models keep the final Linear inside
            # `classifier`, at index 3 for MobileNetV3-small and 1 otherwise.
            head_idx = 3 if model_name == "mobilenet_v3_small" else 1
            if model_name == "mobilenet_v2":
                in_features = net.last_channel
            else:
                in_features = net.classifier[head_idx].in_features
            net.classifier[head_idx] = nn.Linear(in_features, num_classes)

    if not wrap:
        return net
    return QuantizableWrapper(net)