In [1]:
import time
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
from torch.quantization import get_default_qconfig, prepare, convert
from torch.quantization import fuse_modules, QuantStub, DeQuantStub
import kagglehub

In [2]:
QUANT_CONFIG_NAME = 'fbgemm' # fbgemm: x86 CPU (Intel/AMD) || qnnpack: ARM CPU (M1 Mac, Raspberry Pi,모바일 기기 등)

torch.backends.quantized.engine = QUANT_CONFIG_NAME

### CIFAR10 데이터 갖고오기 및 DataLoader 생성

In [3]:
# Load dataset
path = kagglehub.dataset_download("sautkin/imagenet1kvalid")

transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

test_dataset = ImageFolder(root=path, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

### 양자화 할 pretrained ResNet50 모델 불러오기

In [4]:
model = torchvision.models.resnet50(pretrained=True)
torch.save(model.state_dict(), "resnet50_full.pth")

model.eval()
model.to('cpu')  # 정적 양자화는 CPU에서 실행 (fbgemm backend를 사용하여 cpu에 최적화된 양자화를 하기 위해)

model



ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

### 양자화 진행을 위한 계층 합치기(fusion)

In [5]:
qconfig = get_default_qconfig(QUANT_CONFIG_NAME)
model.qconfig = qconfig

fused_model = model

def fuse_resnet50_modules(model):
    # Top-level conv fuse
    fuse_modules(model, [['conv1', 'bn1', 'relu']], inplace=True)

    # Layer1~4: Bottleneck blocks fuse
    for layer_name in ['layer1', 'layer2', 'layer3', 'layer4']:
        layer = getattr(model, layer_name)
        for bottleneck in layer:
            fuse_modules(bottleneck, [['conv1', 'bn1']], inplace=True)
            fuse_modules(bottleneck, [['conv2', 'bn2']], inplace=True)
            fuse_modules(bottleneck, [['conv3', 'bn3', 'relu']], inplace=True) # relu 는 블록의 마지막에 단독으로 존재하므로 포함하지 않음. by ChatGPT

            if hasattr(bottleneck, 'downsample') and bottleneck.downsample is not None:
                fuse_modules(bottleneck.downsample, [['0', '1']], inplace=True)

#torch.quantization.convert() 호출시 Linear 계층은 자동으로 양자화 됨.
fuse_resnet50_modules(fused_model)

class QuantizedResNetWrapper(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.quant = QuantStub()
        self.model_fp32 = model
        self.dequant = DeQuantStub()
        
    def forward(self, x):
        x = self.quant(x)
        x = self.model_fp32(x)
        x = self.dequant(x)
        return x

fused_model = QuantizedResNetWrapper(fused_model) #모델에 '입력 양자화', '출력 실수화' 를 적용

fused_model

QuantizedResNetWrapper(
  (quant): QuantStub()
  (model_fp32): ResNet(
    (conv1): ConvReLU2d(
      (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
      (1): ReLU(inplace=True)
    )
    (bn1): Identity()
    (relu): Identity()
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
        (bn1): Identity()
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (bn2): Identity()
        (conv3): ConvReLU2d(
          (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1))
          (1): ReLU(inplace=True)
        )
        (bn3): Identity()
        (relu): Identity()
        (downsample): Sequential(
          (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1))
          (1): Identity()
        )
      )
      (1): Bottleneck(
        (conv1): Conv2d(256, 64, kernel_size=(1

### 모델 양자화

In [6]:
prepared_model = prepare(fused_model)

def calibrate(model, loader): # 정적 양자화에서 필요한 scale/zero_point를 추정하기 위해 calibration 진행
    model.eval()
    with torch.no_grad():
        for i, (images, _) in enumerate(loader):
            if i > 10: break  # 일부 input 만 사용
            model(images)

calibrate(prepared_model, test_loader)

quantized_model = convert(prepared_model)

torch.save(quantized_model.state_dict(), "resnet50_quantized_full.pt") #모댈 구조 + parameter 저장



### 양자화 모델 평가

In [7]:
def evaluate(model, loader):
    correct = 0
    total = 0
    start = time.time()
    with torch.no_grad():
        for i, (images, labels) in enumerate(loader):
            if i >= 5: 
                break
            outputs = model(images)
            _, preds = torch.max(outputs.data, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    end = time.time()
    acc = 100 * correct / total
    latency = (end - start) / len(loader)
    return acc, latency

acc, latency = evaluate(model, test_loader)
print(f"[Origin]Accuracy: {acc:.2f}%, Latency: {latency:.4f}s per 5_batch")

[Origin]Accuracy: 0.62%, Latency: 0.0039s per batch


In [27]:
quantized_acc, quantized_latency = evaluate(quantized_model, test_loader)
print(f"[Quantized]Accuracy: {quantized_acc:.2f}%, Latency: {quantized_latency:.4f}s per batch")

NotImplementedError: Could not run 'quantized::conv2d_relu.new' with arguments from the 'CPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'quantized::conv2d_relu.new' is only available for these backends: [Meta, QuantizedCPU, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradMPS, AutogradXPU, AutogradHPU, AutogradLazy, AutogradMTIA, AutogradMeta, Tracer, AutocastCPU, AutocastMTIA, AutocastXPU, AutocastMPS, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher].

Meta: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\MetaFallbackKernel.cpp:23 [backend fallback]
QuantizedCPU: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\native\quantized\cpu\qconv.cpp:2044 [kernel]
BackendSelect: fallthrough registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\PythonFallbackKernel.cpp:194 [backend fallback]
FuncTorchDynamicLayerBackMode: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\functorch\DynamicLayer.cpp:479 [backend fallback]
Functionalize: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\FunctionalizeFallbackKernel.cpp:349 [backend fallback]
Named: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\NamedRegistrations.cpp:7 [backend fallback]
Conjugate: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\ConjugateFallback.cpp:17 [backend fallback]
Negative: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\native\NegateFallback.cpp:18 [backend fallback]
ZeroTensor: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\ZeroTensorFallback.cpp:86 [backend fallback]
ADInplaceOrView: fallthrough registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\VariableFallbackKernel.cpp:100 [backend fallback]
AutogradOther: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\VariableFallbackKernel.cpp:63 [backend fallback]
AutogradCPU: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\VariableFallbackKernel.cpp:67 [backend fallback]
AutogradCUDA: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\VariableFallbackKernel.cpp:75 [backend fallback]
AutogradXLA: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\VariableFallbackKernel.cpp:83 [backend fallback]
AutogradMPS: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\VariableFallbackKernel.cpp:91 [backend fallback]
AutogradXPU: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\VariableFallbackKernel.cpp:71 [backend fallback]
AutogradHPU: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\VariableFallbackKernel.cpp:104 [backend fallback]
AutogradLazy: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\VariableFallbackKernel.cpp:87 [backend fallback]
AutogradMTIA: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\VariableFallbackKernel.cpp:79 [backend fallback]
AutogradMeta: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\VariableFallbackKernel.cpp:95 [backend fallback]
Tracer: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\torch\csrc\autograd\TraceTypeManual.cpp:294 [backend fallback]
AutocastCPU: fallthrough registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\autocast_mode.cpp:322 [backend fallback]
AutocastMTIA: fallthrough registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\autocast_mode.cpp:466 [backend fallback]
AutocastXPU: fallthrough registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\autocast_mode.cpp:504 [backend fallback]
AutocastMPS: fallthrough registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\autocast_mode.cpp:209 [backend fallback]
AutocastCUDA: fallthrough registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\autocast_mode.cpp:165 [backend fallback]
FuncTorchBatched: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\functorch\LegacyBatchingRegistrations.cpp:731 [backend fallback]
BatchedNestedTensor: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\functorch\LegacyBatchingRegistrations.cpp:758 [backend fallback]
FuncTorchVmapMode: fallthrough registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\functorch\VmapModeRegistrations.cpp:27 [backend fallback]
Batched: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\LegacyBatchingRegistrations.cpp:1075 [backend fallback]
VmapMode: fallthrough registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\VmapModeRegistrations.cpp:33 [backend fallback]
FuncTorchGradWrapper: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\functorch\TensorWrapper.cpp:208 [backend fallback]
PythonTLSSnapshot: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\PythonFallbackKernel.cpp:202 [backend fallback]
FuncTorchDynamicLayerFrontMode: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\functorch\DynamicLayer.cpp:475 [backend fallback]
PreDispatch: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\PythonFallbackKernel.cpp:206 [backend fallback]
PythonDispatcher: registered at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\core\PythonFallbackKernel.cpp:198 [backend fallback]


### ONNX모델 파일 생성

In [9]:
x = torch.randn(1, 3, 224, 224)

torch.onnx.export(model, x, "resnet50_full.onnx", opset_version=11)
#torch.onnx.export(quantized_model, x, "resnet50_quantized_full.onnx", opset_version=11)

### ONNX Runtime 사용 최적화 후 Latency 비교

In [25]:
import onnxruntime as ort
import numpy as np

ort_session = ort.InferenceSession("resnet50_full.onnx")

def to_numpy(tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

# Measure ONNX latency
start = time.time()
for i, (images, _) in enumerate(test_loader):
    if i >= 5:
        break
    images = images.numpy()
    for j in range(image.shape[0]):
        image = images[j]
        image = np.expand_dims(image, axis=0)
        ort_inputs = {"input.1": image}
        ort_outs = ort_session.run(None, ort_inputs)
end = time.time()
onnx_latency = (end - start) / len(test_loader)
print(f"[ONNX Runtime] Latency: {onnx_latency:.4f}s per 5_batch")

[ONNX Runtime] Latency: 0.0007s per 5_batch


#### ONNX runtime 을 사용하여 최적화 하였을때 기존 5배치당 0.0039s 에서 0.0007s 로 약 82프로 latency가 감소한 것을 볼 수 있다.