In [1]:
import torch, time
import torch.nn as nn
import torch.optim as optim

from src.utils import *
from src.override_resnet import *


class Args:
    """Static configuration for this notebook (a plain namespace of class attributes)."""

    # --- model / data selection ---
    arch = 50  # key used to look up the architecture and its pretrained weights
    dataset = "ImageNet"  # alternative kept from the original: "CIFAR100"
    quan = "static"  # quantization mode: "fp32", "dynamic", "static" or "qat"

    # --- optimisation settings (not read by the cells shown in this notebook) ---
    lr = 0.001
    momentum = 0.9
    batch = 16
    epochs = 10
    save_every = 1

    # --- run flags (not read by the cells shown in this notebook) ---
    only_eval = True
    verbose = True


# Single shared configuration object used by the cells below.
args = Args()

In [2]:
def fuse_model(model) -> nn.Module:
    """Fuse the conv/bn(/relu) groups of a quantizable ResNet in place.

    Walks every sub-module of ``model`` and:

    * for the module whose class name equals ``ResNet_quan``'s, fuses the stem
      ``conv1 + bn1 + relu``;
    * for every module whose exact type is ``BottleNeck_quan``, fuses
      ``conv1 + bn1 + relu1``, ``conv2 + bn2 + relu2`` and ``conv3 + bn3``,
      plus the ``downsample`` pair ``"0" + "1"`` when a downsample branch exists.

    Args:
        model: the network to fuse; it is modified in place.

    Returns:
        The same ``model`` object, after fusion.

    Raises:
        ValueError: if a second module with ``ResNet_quan``'s class name is
            encountered during the walk (reported as "already fused").
    """
    stem_done = False

    for module in model.modules():
        # The stem is matched by class name, the bottlenecks by exact type.
        if type(module).__name__ == ResNet_quan.__name__:
            if stem_done:
                raise ValueError("ResNet_quan is already fused")
            stem_done = True
            torch.quantization.fuse_modules(
                module, ["conv1", "bn1", "relu"], inplace=True
            )

        if type(module) is not BottleNeck_quan:
            continue

        torch.quantization.fuse_modules(
            module,
            [
                ["conv1", "bn1", "relu1"],
                ["conv2", "bn2", "relu2"],
                ["conv3", "bn3"],
            ],
            inplace=True,
        )

        # The shortcut branch is a Sequential(conv, bn) addressed by index names.
        shortcut = module.downsample
        if shortcut is not None:
            torch.quantization.fuse_modules(shortcut, ["0", "1"], inplace=True)

    return model

In [3]:
# %% Build the model for the quantization mode selected in Args.quan
#
# After this cell, `model` and `device` are defined for every supported mode;
# unsupported modes raise instead of leaving the names undefined.

args = Args()

if args.quan == "fp32":
    # Case 0: no quantization -- pretrained float network, on the GPU when available.
    print("----------No quantization enabled")
    device = str(torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
    model = layers_mapping[args.arch](
        weights=pretrained_weights_mapping[args.arch]
    ).to(device)

elif args.quan == "dynamic":
    # Case 1: dynamic quantization of the Linear layers (int8 weights).
    print("----------Dynamic Quantization enabled")
    # Eager-mode quantized operators target CPU backends, so the model is kept
    # on the CPU (the previous hard-coded "cuda" also failed on CPU-only hosts).
    device = "cpu"
    model = resnet50_quan(weights=pretrained_weights_mapping[args.arch]).to(device)
    model = torch.quantization.quantize_dynamic(
        model, {torch.nn.Linear}, dtype=torch.qint8
    )

elif args.quan == "static":
    # Case 2: post-training static quantization; fusion, observer insertion,
    # calibration and conversion happen in the cells further down.
    print("----------Static Quantization enabled")
    device = "cpu"
    model = resnet50_quan(weights=pretrained_weights_mapping[args.arch]).to(device)

elif args.quan == "qat":
    # Case 3: quantization-aware training. Nothing is built for this mode yet,
    # so fail here rather than with a NameError on `model` in a later cell.
    print("----------Quantization Aware Training enabled")
    raise NotImplementedError(
        "Quantization Aware Training is not implemented in this notebook"
    )

else:
    raise ValueError("Invalid quantization method")

# Checkpoint naming scheme, currently disabled:
# _folder_path = f"resnet{args.arch}_{args.dataset}" + "_" + args.quan
# _file_name = (
#     f"resnet{args.arch}_{args.dataset}_epoch"  # resnet18_cifar10_epoch{epoch}.pth
# )


----------Static Quantization enabled


# 1. Accuracy of the Reference Network

- Check the original network architecture

In [4]:
# Show the first block of layer1 as loaded, before fuse_model() is applied.
print(model.layer1[0])

BottleNeck_quan(
  (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (downsample): Sequential(
    (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (relu1): ReLU()
  (relu2): ReLU()
  (relu3): ReLU()
  (add): FloatFunctional(
    (activation_post_process): Identity()
  )
)


In [5]:
# Reference-accuracy evaluation; the call is currently disabled.
# NOTE(review): the message below is still printed, so the output reads
# "Eval done" although no evaluation runs in this cell.
# check_accuracy(model=model, device="cpu", batch_size=25)
print("Post Training Quantization: Eval done")

Post Training Quantization: Eval done


- Check the fused network architecture

In [6]:
# Switch to inference mode before fusing conv/bn(/relu) groups.
model.eval()
model = fuse_model(model)

# print_size_of_model() prints the "Size (MB): ..." line itself and returns the
# number; wrapping it in print() echoed the same value a second time.
fused_size_mb = print_size_of_model(model)

# Same block as inspected earlier, now with the fused modules in place.
print(model.layer1[0])

Size (MB): 102.158986
102.158986
BottleNeck_quan(
  (conv1): ConvReLU2d(
    (0): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
    (1): ReLU()
  )
  (bn1): Identity()
  (conv2): ConvReLU2d(
    (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
  )
  (bn2): Identity()
  (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1))
  (bn3): Identity()
  (relu): ReLU(inplace=True)
  (downsample): Sequential(
    (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1))
    (1): Identity()
  )
  (relu1): Identity()
  (relu2): Identity()
  (relu3): ReLU()
  (add): FloatFunctional(
    (activation_post_process): Identity()
  )
)


# 2. Calibration for Post-Training Static Quantization

- Check the Quantization Configuration

In [8]:
# Attach the default eager-mode qconfig for the "x86" backend to the whole model.
# Its printed repr (see the cell output) shows:
#   activation -> HistogramObserver(reduce_range=True)
#   weight     -> PerChannelMinMaxObserver(dtype=torch.qint8,
#                                          qscheme=torch.per_channel_symmetric)
x86_qconfig = torch.quantization.get_default_qconfig("x86")
model.qconfig = x86_qconfig

# Hand-written alternative, left disabled:
# model.qconfig = torch.quantization.QConfig(
#     activation=torch.quantization.observer.HistogramObserver.with_args(
#         reduce_range=True
#     ),
#     weight=torch.quantization.observer.PerChannelMinMaxObserver.with_args(qscheme=torch.per_channel_symmetric),
# )
print(model.qconfig)


QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})


In [9]:
# Insert observers for post-training static quantization, using `model.qconfig`.
#
# Eager-mode `torch.quantization.prepare` reads only the
# "float_to_observed_custom_module_class" entry of `prepare_custom_config_dict`.
# Per-type / per-name overrides written as "object_type" / "module_name"
# entries (for example a NoopObserver request for the first Conv2d) are
# silently ignored there, so no such dict is passed: every quantizable layer is
# observed with `model.qconfig`, which is what actually happened before as well.
#
# NOTE(review): to really keep one layer out of quantization in eager mode, set
# `<submodule>.qconfig = None` before calling prepare() -- first confirm that the
# QuantStub / DeQuantStub placement inside the model supports a float layer there.
torch.quantization.prepare(model, inplace=True)

print("Post Training Quantization Prepare: Inserting Observers")

Post Training Quantization Prepare: Inserting Observers




In [10]:
# torch.quantization.prepare(model, inplace=True)

# print("Post Training Quantization Prepare: Inserting Observers")

- Run inference on a representative dataset (to calculate the quantization parameters)

In [11]:
# Calibration: feed training batches through the observed model so the inserted
# observers can record activation statistics.
criterion = nn.CrossEntropyLoss()
train_loader, test_loader = GetDataset(
    dataset_name=args.dataset,
    device=device,
    root="data",
    batch_size=256,
    num_workers=8,
)

# Calibrate on the GPU when one is present; fall back to the CPU instead of
# crashing on machines without CUDA (the device used to be hard-coded to "cuda").
calibration_device = "cuda" if torch.cuda.is_available() else "cpu"

# The trailing 1000 is passed positionally; judging by the recorded progress bar
# (it stops at 999/5005) it presumably caps the number of batches -- TODO confirm.
_, _ = SingleEpochEval(model, train_loader, criterion, calibration_device, 1000)
print("Post Training Quantization: Calibration done")

 20%|█▉        | 999/5005 [16:47<1:07:20,  1.01s/it]

Post Training Quantization: Calibration done





- Convert to quantized model

In [12]:
# Move the calibrated model to the CPU, then replace the observed modules with
# their quantized counterparts (conversion is done in place).
device = "cpu"
model = model.to(device)
model = torch.quantization.convert(model, inplace=True)
print("Post Training Quantization: Convert done")

Post Training Quantization: Convert done


# 3. Evaluate the Quantized Model

In [14]:
# Evaluate the converted model on the CPU with batches of 25 samples.
check_accuracy(model=model, device="cpu", batch_size=25)
print("Post Training Quantization: Eval done")

Elapsed time:  23 ms
Size (MB): 26.151272


 25%|██▍       | 499/2000 [02:40<08:03,  3.11it/s]

Eval Loss: 1.2497, Eval Acc: 85.41%
Post Training Quantization: Eval done



