In [1]:
import torch 
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

# 1. Hyper-parameters and Dataset

In [2]:
# Device configuration: use the GPU when one is visible to PyTorch, else the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Reproducibility: seed PyTorch's random number generators so that weight
# initialisation and the shuffling order of the training loader repeat across
# "Restart & Run All". (Some GPU kernels may still be nondeterministic.)
random_seed = 0
torch.manual_seed(random_seed)

# Hyper parameters
num_epochs = 5         # full passes over the training set
num_classes = 10       # number of output logits of the classifier
batch_size = 100       # samples per mini-batch for both data loaders
learning_rate = 0.001  # step size passed to the optimizer

In [3]:
# Colab-only step: mount Google Drive at /content/drive. The dataset directory
# used later in this notebook (data_dir) lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Directory on the mounted drive that holds the MNIST files
# (download=True on the training split fetches them into it when needed).
data_dir = '/content/drive/My Drive/PyTorch/Github_Series/02-intermediate/'

# MNIST training and test splits; every image is converted to a tensor on access.
train_dataset = torchvision.datasets.MNIST(
    root=data_dir,
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
test_dataset = torchvision.datasets.MNIST(
    root=data_dir,
    train=False,
    transform=transforms.ToTensor(),
)

# Mini-batch iterators: the training data is reshuffled every epoch,
# the test data is read in its stored order.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# 2. Modeling and Training

**Batch Normalization**
1. Batch normalization accelerates deep network training. [Paper of Batch Normalization.](https://arxiv.org/abs/1502.03167)
2. Because `BatchNorm2d` normalizes each channel (the C dimension) using statistics computed over the (N, H, W) slices, it is commonly called Spatial Batch Normalization.
3. A discussion about the [Ordering of batch normalization and activation.](https://stackoverflow.com/questions/39691902/ordering-of-batch-normalization-and-dropout#:~:text=So%20in%20summary%2C%20the%20order%20of%20using%20batch,%28or%20other%20activation%29%20-%3E%20Dropout%20-%3E%20CONV%2FFC%20-%3E), which is also discussed in the paper listed above. 

**Convolutional layer**
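1. The formula for computing the output shape is given in [the PyTorch documentation](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html). Each convolution below (kernel 5, stride 1, padding 2) preserves the spatial size and each 2x2 max-pooling halves it, so for 28x28 MNIST images the feature maps are 14x14 after *layer1* and 7x7 after *layer2*.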

In [5]:
# Convolutional neural network (two convolutional layers)
class ConvNet(nn.Module):
  """Two convolutional stages followed by one fully connected layer.

  Each stage is Conv2d (5x5, stride 1, padding 2) -> BatchNorm2d -> ReLU ->
  MaxPool2d (2x2, stride 2). The convolution keeps the spatial size and the
  pooling halves it, so a 28x28 input yields 32 feature maps of 7x7, which
  is the flattened size the final linear layer expects.

  Args:
    num_classes: number of logits produced by the final linear layer.
  """

  def __init__(self, num_classes=10):
    super().__init__()
    # Stage 1: 1 input channel -> 16 feature maps.
    first_stage = [
        nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
        nn.BatchNorm2d(16),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),
    ]
    self.layer1 = nn.Sequential(*first_stage)

    # Stage 2: 16 -> 32 feature maps.
    second_stage = [
        nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
        nn.BatchNorm2d(32),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),
    ]
    self.layer2 = nn.Sequential(*second_stage)

    # Classifier head over the flattened 7x7x32 features.
    self.fc = nn.Linear(in_features=7 * 7 * 32, out_features=num_classes)

  def forward(self, x):
    """Return the class logits, one row per sample in the batch `x`."""
    features = self.layer2(self.layer1(x))
    # Collapse everything except the batch dimension into one feature vector.
    flat = features.reshape(features.size(0), -1)
    return self.fc(flat)

In [6]:
# Build the network and move its parameters to the selected device.
model = ConvNet(num_classes=num_classes).to(device)

# Cross-entropy loss on the raw logits, optimised with Adam.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [7]:
# Train the model
# Switch to training mode explicitly: if this cell is re-run after the
# evaluation cell (which calls model.eval()), BatchNorm would otherwise keep
# using its running statistics instead of mini-batch statistics.
model.train()
total_step = len(train_loader)
for epoch in range(num_epochs):
  for batch_id, (images, labels) in enumerate(train_loader):
    # keep model and data on the same device
    images = images.to(device)
    labels = labels.to(device)

    # Feedforward: call the module itself rather than .forward() so that
    # any registered hooks are executed.
    output = model(images)
    loss = loss_fn(output, labels)

    # Backward propagation: clear old gradients, compute new ones, update weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the loss of the current mini-batch every 100 steps
    if (batch_id+1) % 100 == 0:
      print('Epoch: [{}/{}], Step: [{}/{}], Loss: {:.4f}'
            .format(epoch+1, num_epochs, batch_id+1, total_step, loss.item()))

Epoch: [1/5], Step: [100/600], Loss: 0.1569
Epoch: [1/5], Step: [200/600], Loss: 0.1130
Epoch: [1/5], Step: [300/600], Loss: 0.1141
Epoch: [1/5], Step: [400/600], Loss: 0.0510
Epoch: [1/5], Step: [500/600], Loss: 0.0988
Epoch: [1/5], Step: [600/600], Loss: 0.0347
Epoch: [2/5], Step: [100/600], Loss: 0.0225
Epoch: [2/5], Step: [200/600], Loss: 0.0150
Epoch: [2/5], Step: [300/600], Loss: 0.0189
Epoch: [2/5], Step: [400/600], Loss: 0.0863
Epoch: [2/5], Step: [500/600], Loss: 0.0367
Epoch: [2/5], Step: [600/600], Loss: 0.0338
Epoch: [3/5], Step: [100/600], Loss: 0.0165
Epoch: [3/5], Step: [200/600], Loss: 0.0385
Epoch: [3/5], Step: [300/600], Loss: 0.0295
Epoch: [3/5], Step: [400/600], Loss: 0.0248
Epoch: [3/5], Step: [500/600], Loss: 0.0270
Epoch: [3/5], Step: [600/600], Loss: 0.0218
Epoch: [4/5], Step: [100/600], Loss: 0.0048
Epoch: [4/5], Step: [200/600], Loss: 0.0631
Epoch: [4/5], Step: [300/600], Loss: 0.0021
Epoch: [4/5], Step: [400/600], Loss: 0.0268
Epoch: [4/5], Step: [500/600], L

# 3. Test the model

In [8]:
# Test the model
model.eval()  # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance)
total = 0    # number of test samples seen
correct = 0  # number of samples whose predicted class equals the label
with torch.no_grad():  # inference only, so no gradient bookkeeping is needed
  for images, labels in test_loader:
    images = images.to(device)
    labels = labels.to(device)
    output = model(images)
    # Predicted class = index of the largest logit in each row
    pred = output.argmax(dim=1)
    total += labels.size(0)
    # .item() turns the 0-dim count tensor into a Python int, so the accuracy
    # below is computed with Python numbers instead of float32 tensor
    # arithmetic (which printed e.g. 98.97000122070312 instead of 98.97).
    correct += (pred == labels).sum().item()

print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))

Test Accuracy of the model on the 10000 test images: 98.97000122070312 %
