In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

import time

class Net(nn.Module):
  def __init__(self):
    super(Net, self).__init__()
    self.quant = torch.quantization.QuantStub() 
    self.conv1 = nn.Conv2d(1, 32, 3, 1)
    self.conv2 = nn.Conv2d(32, 64, 3, 1)
    self.relu = torch.nn.ReLU()
    self.dropout1= nn.Dropout(0.25)
    self.dropout2 = nn.Dropout(0.5)
    self.fc1 = nn.Linear(9216, 128)
    self.fc2 = nn.Linear(128, 10)
    self.maxpool = nn.MaxPool2d(2)
    self.logsoftmax = torch.nn.LogSoftmax(dim=1)
    self.dequant = torch.quantization.DeQuantStub()


  def forward(self, x):
    x = self.quant(x) # QuanStub를 forward를 시작하는 부분에 적어준다.
    x = self.conv1(x)
    x = self.relu(x)
    x = self.conv2(x)
    x = self.relu(x)
    x = self.maxpool(x)
    x = self.dropout1(x)
    x = torch.flatten(x, 1)
    x = self.fc1(x)
    x = self.relu(x)
    x = self.dropout2(x)
    x = self.fc2(x)
    x = self.dequant(x) # DeQuanStub를 forward가 끝나는 부분에 적어준다. LogSoftmax의 경우, 후에 추론 시에 사용할 데이터 형태인 QuantizedCPU형을 지원하지 않으므로, LogSofmax이전에 dequantization을 해준다.
    x = self.logsoftmax(x)
    
    return x

In [7]:
transform = transforms.Compose([
                                     transforms.ToTensor(),
                                     transforms.Normalize((0.1307,), (0.3081,))
    ])

device = 'cuda'

dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform = transform)
dataset2 = datasets.MNIST('../data', train=False, download=True,
                              transform = transform)


train_loader = torch.utils.data.DataLoader(dataset1, batch_size=64)
test_loader = torch.utils.data.DataLoader(dataset2, batch_size=64)


start = time.time()

model_fp32 = Net()
model_fp32.train() # 아래에 진행될 Quantization Aware Training logic이 작동하기 위해서는 모델을 train 모드로 바꿔줘야 한다고 한다.
model_fp32.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
model_fp32_fused = torch.quantization.fuse_modules(model_fp32, [['conv1', 'relu']])
model_fp32_prepared = torch.quantization.prepare_qat(model_fp32_fused)
model_fp32_prepared = model_fp32_prepared.to("cuda")
optimizer = optim.SGD(model_fp32_prepared.parameters(), lr=0.01, momentum=0.5)


##CUDA를 이용해서 학습한다.
for epoch in range(3):
  for batch_idx, (data, target) in enumerate(train_loader):
      data, target = data.to(device), target.to(device)
      optimizer.zero_grad()
      output = model_fp32_prepared(data)
      loss = F.nll_loss(output, target)
      loss.backward()
      optimizer.step()
      if batch_idx & 10 == 0:
        print('Train Epoch: {} [{}/{} ({:.0f}%]\tLoss:{:.6f}'.format(
            epoch, batch_idx*len(data), len(train_loader.dataset),
            100.*batch_idx / len(train_loader), loss.item()
        ))


model_fp32_prepared.eval()
model_int8 = torch.quantization.convert(model_fp32_prepared.to('cpu')) #quantized aware training을 floating point로 수행한 model을 quantized integer model로 바꿔준다.



model_int8.eval()


test_loss = 0
correct = 0

start2 = time.time()   
with torch.no_grad():
  for data, target in test_loader:
    data, target = data.to('cpu'), target.to('cpu') #GPU는 integer형 연산을 지원하지 않으므로 추론 속도를 비교하기 위해서 모델과 data를 모두 cpu로 옮겨줬다.
    output = model_int8(data)
    test_loss += F.nll_loss(output, target, reduction='sum').item()
    pred = output.argmax(dim=1, keepdim=True)
    correct += pred.eq(target.view_as(pred)).sum().item()

test_loss /= len(test_loader.dataset)


print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    test_loss, correct, len(test_loader.dataset),
    100. * correct / len(test_loader.dataset)
))

end = time.time()


print("test 이전까지 경과 시간(secs):",start2-start)
print("inference를 할 때 걸린 시간(secs):",end-start2)
print("total time elapsed(secs):", (end-start))

  reduce_range will be deprecated in a future release of PyTorch."


Train Epoch: 0 [0/60000 (0%]	Loss:2.302708
Train Epoch: 0 [64/60000 (0%]	Loss:2.269552
Train Epoch: 0 [256/60000 (0%]	Loss:2.113500
Train Epoch: 0 [320/60000 (1%]	Loss:2.120816
Train Epoch: 0 [1024/60000 (2%]	Loss:1.683994
Train Epoch: 0 [1088/60000 (2%]	Loss:1.645897
Train Epoch: 0 [1280/60000 (2%]	Loss:1.384257
Train Epoch: 0 [1344/60000 (2%]	Loss:1.454284
Train Epoch: 0 [2048/60000 (3%]	Loss:0.883511
Train Epoch: 0 [2112/60000 (4%]	Loss:0.803058
Train Epoch: 0 [2304/60000 (4%]	Loss:0.781473
Train Epoch: 0 [2368/60000 (4%]	Loss:0.961170
Train Epoch: 0 [3072/60000 (5%]	Loss:0.557012
Train Epoch: 0 [3136/60000 (5%]	Loss:0.605364
Train Epoch: 0 [3328/60000 (6%]	Loss:0.558819
Train Epoch: 0 [3392/60000 (6%]	Loss:0.622773
Train Epoch: 0 [4096/60000 (7%]	Loss:0.622581
Train Epoch: 0 [4160/60000 (7%]	Loss:0.577235
Train Epoch: 0 [4352/60000 (7%]	Loss:0.477506
Train Epoch: 0 [4416/60000 (7%]	Loss:0.576543
Train Epoch: 0 [5120/60000 (9%]	Loss:0.872257
Train Epoch: 0 [5184/60000 (9%]	Loss:0.41

In [8]:
class Net2(nn.Module):
  def __init__(self):
    super(Net2, self).__init__()
    self.conv1 = nn.Conv2d(1, 32, 3, 1)
    self.conv2 = nn.Conv2d(32, 64, 3, 1)
    self.relu = torch.nn.ReLU()
    self.dropout1= nn.Dropout(0.25)
    self.dropout2 = nn.Dropout(0.5)
    self.fc1 = nn.Linear(9216, 128)
    self.fc2 = nn.Linear(128, 10)
    self.maxpool = nn.MaxPool2d(2)
    self.logsoftmax = torch.nn.LogSoftmax(dim=1)



  def forward(self, x):
    x = self.conv1(x)
    x = self.relu(x)
    x = self.conv2(x)
    x = self.relu(x)
    x = self.maxpool(x)
    x = self.dropout1(x)
    x = torch.flatten(x, 1)
    x = self.fc1(x)
    x = self.relu(x)
    x = self.dropout2(x)
    x = self.fc2(x)
    x = self.logsoftmax(x)
    
    return x

In [9]:
start = time.time()

model_compared = Net2().to('cuda')
optimizer = optim.SGD(model_compared.parameters(), lr=0.01, momentum=0.5)
for epoch in range(3):
  model_compared.train()
  for batch_idx, (data, target) in enumerate(train_loader):
      data, target = data.to(device), target.to(device)
      optimizer.zero_grad()
      output = model_compared(data)
      loss = F.nll_loss(output, target)
      loss.backward()
      optimizer.step()
      if batch_idx & 10 == 0:
        print('Train Epoch: {} [{}/{} ({:.0f}%]\tLoss:{:.6f}'.format(
            epoch, batch_idx*len(data), len(train_loader.dataset),
            100.*batch_idx / len(train_loader), loss.item()
        ))


# Inference시의 속도를 Quantization Aware Trained된 모델과 동일한 기준으로 비교하기 위해 모델, 데이터를 cpu로 옮겨줘서 cpu로 연산한다.
model_compared.to('cpu')
model_compared.eval()


test_loss = 0
correct = 0

start2 = time.time()  
with torch.no_grad():
  for data, target in test_loader:
    data, target = data.to('cpu'), target.to('cpu')
    output = model_compared(data)
    test_loss += F.nll_loss(output, target, reduction='sum').item()
    pred = output.argmax(dim=1, keepdim=True)
    correct += pred.eq(target.view_as(pred)).sum().item()

test_loss /= len(test_loader.dataset)


print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    test_loss, correct, len(test_loader.dataset),
    100. * correct / len(test_loader.dataset)
))

end = time.time()

print("test 이전까지 경과 시간(secs):",start2-start)
print("inference를 할 때 걸린 시간(secs):",end-start2)
print("total time elapsed(secs):", (end-start))

Train Epoch: 0 [0/60000 (0%]	Loss:2.304488
Train Epoch: 0 [64/60000 (0%]	Loss:2.318061
Train Epoch: 0 [256/60000 (0%]	Loss:2.255803
Train Epoch: 0 [320/60000 (1%]	Loss:2.244165
Train Epoch: 0 [1024/60000 (2%]	Loss:2.009878
Train Epoch: 0 [1088/60000 (2%]	Loss:1.965530
Train Epoch: 0 [1280/60000 (2%]	Loss:1.862439
Train Epoch: 0 [1344/60000 (2%]	Loss:1.728734
Train Epoch: 0 [2048/60000 (3%]	Loss:1.030696
Train Epoch: 0 [2112/60000 (4%]	Loss:0.945700
Train Epoch: 0 [2304/60000 (4%]	Loss:0.942555
Train Epoch: 0 [2368/60000 (4%]	Loss:1.190373
Train Epoch: 0 [3072/60000 (5%]	Loss:0.869754
Train Epoch: 0 [3136/60000 (5%]	Loss:0.766913
Train Epoch: 0 [3328/60000 (6%]	Loss:0.753187
Train Epoch: 0 [3392/60000 (6%]	Loss:0.703912
Train Epoch: 0 [4096/60000 (7%]	Loss:0.720226
Train Epoch: 0 [4160/60000 (7%]	Loss:0.670393
Train Epoch: 0 [4352/60000 (7%]	Loss:0.599435
Train Epoch: 0 [4416/60000 (7%]	Loss:0.780343
Train Epoch: 0 [5120/60000 (9%]	Loss:1.038589
Train Epoch: 0 [5184/60000 (9%]	Loss:0.51