## Downloading datasets
First we need to download the datasets and load them into batches. To do this we use PyTorch's DataLoader class.

The dataset used in this implementation is [PascalVOC 2012](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/), which is commonly used for object detection. Its annotations are stored as XML files.

The original AlexNet paper uses ImageNet, a very large dataset of labeled high-resolution images. Like ImageNet, PascalVOC contains images of variable resolution, so every image must be rescaled to a fixed resolution because AlexNet requires a constant input dimensionality. The paper describes its preprocessing as follows:

> Therefore, we down-sampled the images to a fixed resolution of 256 × 256. Given a
> rectangular image, we first rescaled the image such that the shorter side was of length 256, and then
> cropped out the central 256 × 256 patch from the resulting image. We did not pre-process the images
> in any other way, except for subtracting the mean activity over the training set from each pixel. So
> we trained our network on the (centered) raw RGB values of the pixels.

In [None]:
import torch
from torch.utils.data import DataLoader
import torchvision
from torchvision import transforms

In [None]:
class ResizeCollate:
    """Collate function that resizes every image and stacks them into one tensor.

    Intended to be passed as ``collate_fn`` to a ``DataLoader`` whose dataset
    yields ``(image, label)`` pairs. Labels are passed through untouched as a
    tuple, so they may be arbitrary (non-tensor) objects.

    Args:
        resize_size: Target size forwarded to ``transforms.functional.resize``;
            either a single int or an ``(height, width)`` sequence.

    Raises:
        TypeError: If ``resize_size`` is not an int, tuple or list.
    """

    def __init__(self, resize_size=(227, 227)):
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not isinstance(resize_size, (int, tuple, list)):
            raise TypeError(
                "resize_size must be an int or a (height, width) sequence, "
                f"got {type(resize_size).__name__}"
            )
        self.resize_size = resize_size

    def __call__(self, batch):
        """Collate a list of ``(image, label)`` samples.

        Args:
            batch: Non-empty sequence of ``(image, label)`` pairs.

        Returns:
            A ``(images, labels)`` pair: the resized images stacked along a new
            leading batch dimension, and the labels as a tuple in sample order.

        Raises:
            ValueError: If ``batch`` is empty.
        """
        if not batch:
            raise ValueError("cannot collate an empty batch")

        images, labels = zip(*batch)
        resized_images = [
            transforms.functional.resize(img, self.resize_size, antialias=True)
            for img in images
        ]
        # All images now share one size, so they can be stacked into a batch.
        batched_images = torch.stack(resized_images)
        return batched_images, labels

# Per-image preprocessing pipeline handed to the datasets: resize, centre-crop
# to 227 pixels, then convert the image to a tensor.
transform = transforms.Compose([
    transforms.Resize(227),
    transforms.CenterCrop(227),
    # NOTE(review): Normalize works on tensors, so if the line below is ever
    # re-enabled it presumably has to move after ToTensor() — confirm first.
    # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    transforms.ToTensor(),
])

In [None]:
# Training split of PascalVOC 2012 (detection annotations), stored under ./data
# and preprocessed per image by `transform`.
train_dataset = torchvision.datasets.VOCDetection(
    './data',
    year='2012',
    image_set='train',
    download=True,
    transform=transform,
)

# Shuffled mini-batches of 128 for training; ResizeCollate stacks the images.
train_loader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    collate_fn=ResizeCollate(),
)

# The 'val' split of the same dataset serves as the held-out test set.
test_dataset = torchvision.datasets.VOCDetection(
    './data',
    year='2012',
    image_set='val',
    download=True,
    transform=transform,
)

# Small, unshuffled batches so evaluation order is reproducible.
test_loader = DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False,
    collate_fn=ResizeCollate(),
)


In [None]:
from torchvision.utils import make_grid
import matplotlib.pyplot as plt

from itertools import islice

# Number of training batches to preview (used as the islice limit below).
num = 5

def show_batch(batch_images, nrow=16):
    """Draw a batch of images as one grid in a new matplotlib figure.

    Args:
        batch_images: Batch of images accepted by ``make_grid``
            (presumably a ``(B, C, H, W)`` tensor — TODO confirm).
        nrow: Number of images per grid row, forwarded to ``make_grid``.
    """
    _, ax = plt.subplots(figsize=(16, 12))
    # Hide the axis ticks: pixel coordinates are meaningless for a preview grid.
    ax.set_xticks([])
    ax.set_yticks([])
    grid = make_grid(batch_images, nrow=nrow)
    # The grid is channel-first; imshow expects channel-last.
    ax.imshow(grid.permute(1, 2, 0))

# Preview only the first `num` training batches; the targets are ignored
# because only the images are displayed.
for batch_images, _ in islice(train_loader, num):
    show_batch(batch_images)

In [None]:
from torch import nn

class AlexNet(torch.nn.Module):
    """AlexNet-style classifier: five convolutional stages and three FC layers.

    The first fully connected layer expects 9216 = 256 * 6 * 6 input features,
    which is what the convolutional stack produces for 3 x 227 x 227 inputs.

    Args:
        num_classes: Size of the output layer (defaults to 20).
    """

    def __init__(self, num_classes=20):
        super().__init__()

        # Convolutional layers
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2)
        )

        self.layer2 = nn.Sequential(
            nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2)
        )

        self.layer3 = nn.Sequential(
            nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1),
            nn.ReLU()
        )

        self.layer4 = nn.Sequential(
            nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),
            nn.ReLU()
        )

        self.layer5 = nn.Sequential(
            nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2)
        )

        # Fully Connected Layers
        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(9216, 4096),
            nn.ReLU())

        self.fc1 = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU())

        self.fc2 = nn.Sequential(
            nn.Linear(4096, num_classes))

    def forward(self, x):
        """Return raw class scores of shape ``(batch, num_classes)``.

        Args:
            x: Image batch of shape ``(batch, 3, 227, 227)``.
        """
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)

        # Collapse (batch, 256, 6, 6) into (batch, 9216); without this the
        # first Linear layer fails with a shape mismatch.
        x = torch.flatten(x, 1)

        x = self.fc(x)
        x = self.fc1(x)
        x = self.fc2(x)

        return x

In [None]:

class AlexNet(nn.Module):
    """AlexNet variant with batch normalisation after every convolution.

    Five convolutional stages (conv -> batch-norm -> ReLU, with max-pooling
    after stages 1, 2 and 5) feed three fully connected layers. The first
    fully connected layer takes 9216 flattened features.

    Args:
        num_classes: Number of output scores produced by the last layer.
    """

    def __init__(self, num_classes=20):
        super().__init__()
        self.layer1 = self._conv_stage(3, 96, kernel_size=11, stride=4, padding=0, pool=True)
        self.layer2 = self._conv_stage(96, 256, kernel_size=5, stride=1, padding=2, pool=True)
        self.layer3 = self._conv_stage(256, 384, kernel_size=3, stride=1, padding=1, pool=False)
        self.layer4 = self._conv_stage(384, 384, kernel_size=3, stride=1, padding=1, pool=False)
        self.layer5 = self._conv_stage(384, 256, kernel_size=3, stride=1, padding=1, pool=True)

        # Classifier head: two dropout-regularised hidden layers, then the output.
        self.fc = nn.Sequential(nn.Dropout(0.5), nn.Linear(9216, 4096), nn.ReLU())
        self.fc1 = nn.Sequential(nn.Dropout(0.5), nn.Linear(4096, 4096), nn.ReLU())
        self.fc2 = nn.Sequential(nn.Linear(4096, num_classes))

    @staticmethod
    def _conv_stage(in_channels, out_channels, *, kernel_size, stride, padding, pool):
        """Build one conv -> batch-norm -> ReLU stage, optionally max-pooled."""
        modules = [
            nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size,
                      stride=stride, padding=padding),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        ]
        if pool:
            modules.append(nn.MaxPool2d(kernel_size=3, stride=2))
        return nn.Sequential(*modules)

    def forward(self, x):
        """Run the network on an image batch and return the raw class scores."""
        features = x
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4, self.layer5):
            features = stage(features)

        # Keep the batch dimension and flatten everything else for the FC layers.
        flat = features.reshape(features.size(0), -1)

        hidden = self.fc1(self.fc(flat))
        return self.fc2(hidden)

In [None]:
# Build the network with its default arguments and print its module tree
# as a quick check of the layer configuration.
alexnet = AlexNet()

print(alexnet)

In [None]:
import torch.optim as optim

# Classification loss applied to the network outputs.
criterion = torch.nn.CrossEntropyLoss()
# SGD with momentum and weight decay over all of the model's parameters.
optimiser = optim.SGD(alexnet.parameters(), momentum=0.9, lr=0.005, weight_decay=0.005)

In [None]:
# The 20 PascalVOC object categories. A category's position in this tuple is
# the class index given to the loss.
VOC_CLASSES = (
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
    'bus', 'car', 'cat', 'chair', 'cow',
    'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
)
VOC_CLASS_TO_INDEX = {name: index for index, name in enumerate(VOC_CLASSES)}


def voc_targets_to_labels(targets):
    """Turn VOCDetection annotation dicts into a tensor of class indices.

    CrossEntropyLoss needs one integer class per image, whereas the loader
    yields one annotation dict per image. As a simplification, each image is
    labelled with the category of its first annotated object.

    Args:
        targets: Iterable of annotation dicts as yielded by ``VOCDetection``.

    Returns:
        A 1-D ``torch.long`` tensor with one class index per target.

    Raises:
        KeyError: If an object's name is not one of ``VOC_CLASSES``.
    """
    names = (target['annotation']['object'][0]['name'] for target in targets)
    return torch.tensor([VOC_CLASS_TO_INDEX[name] for name in names], dtype=torch.long)


NUM_EPOCHS = 2
# An epoch at batch size 128 has far fewer than 2000 mini-batches, so report
# the running loss at a much shorter interval.
LOG_INTERVAL = 10

alexnet.train()  # make sure dropout is active while training
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    batches_since_log = 0
    for i, (inputs, targets) in enumerate(train_loader):
        labels = voc_targets_to_labels(targets)
        optimiser.zero_grad()

        outputs = alexnet(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimiser.step()

        running_loss += loss.item()
        batches_since_log += 1
        if batches_since_log == LOG_INTERVAL:
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / batches_since_log:.3f}')
            running_loss = 0.0
            batches_since_log = 0

    # Report the tail of the epoch that did not fill a whole logging interval.
    if batches_since_log:
        print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / batches_since_log:.3f}')

print('Finished Training')

In [None]:
import numpy as np

def imshow(img, mean=None, std=None):
    """Show a channel-first image tensor with matplotlib.

    Args:
        img: Image tensor laid out as ``(C, H, W)``.
        mean: Optional per-channel means that were used to normalise ``img``.
        std: Optional per-channel standard deviations used to normalise ``img``.

    The image is un-normalised only when both ``mean`` and ``std`` are given.
    By default it is shown as is, because the preprocessing pipeline in this
    file applies no normalisation; an unconditional ``img / 2 + 0.5`` would
    wash the picture out.
    """
    npimg = img.numpy()
    if mean is not None and std is not None:
        # Invert Normalize channel-wise: x = x_norm * std + mean.
        channel_std = np.asarray(std, dtype=npimg.dtype).reshape(-1, 1, 1)
        channel_mean = np.asarray(mean, dtype=npimg.dtype).reshape(-1, 1, 1)
        npimg = np.clip(npimg * channel_std + channel_mean, 0.0, 1.0)
    # matplotlib expects channel-last images.
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()

# Pull a single batch from the validation loader for a visual sanity check.
dataiter = iter(test_loader)
images, labels = next(dataiter)

# print images
imshow(torchvision.utils.make_grid(images))
# Each label is a VOCDetection annotation dict, not a class index, so read the
# category name of the first annotated object straight from the annotation.
groundtruths = ' '.join(
    f"{label['annotation']['object'][0]['name']:5s}" for label in labels
)
print(f"GroundTruth: {groundtruths}")