# VGG Network

<div>
    <img src="./pic/structure.png" width="80%">
</div>

## Step1. Import the libraries

In [None]:
# import libraries
import numpy as np
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler

# Device configuration: use the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Release cached GPU memory that PyTorch is not currently using
torch.cuda.empty_cache()

## Step2. Load the dataset

```torchvision``` is a library that provides easy access to many computer vision datasets, together with methods to pre-process these datasets in an easy and intuitive manner.

For the VGG model, a few points need attention:
1.  The variable `normalize` is defined with the mean and standard deviation of each channel (red, green, and blue) in the dataset. The `transform` variable then resizes the data, converts it to tensors, and normalizes it.
2.  If the ```test``` argument is true, the test split of the dataset is loaded. Otherwise (false is the default), the train split is loaded and randomly split into a training set and a validation set (0.9:0.1).
3. A large dataset may affect performance, so data loaders allow us to iterate through the data in batches; the data is loaded while iterating rather than all at once into RAM at the start.

In [None]:
def data_loader(data_dir, batch_size, random_seed=42,
                valid_size=0.1, shuffle=True, test=False):
    """Build CIFAR-100 data loaders whose images are resized to 227x227.

    Args:
        data_dir: Root directory the dataset is read from (and downloaded to).
        batch_size: Number of samples per batch.
        random_seed: Seed given to ``np.random.seed`` before the indices of
            the train split are shuffled (only used when ``shuffle`` is true
            and ``test`` is false).
        valid_size: Fraction of the train split held out for validation;
            must lie in ``[0, 1]``.
        shuffle: For the train split, whether the indices are shuffled before
            being divided into train/validation parts.  For the test split,
            forwarded to the ``DataLoader`` as its ``shuffle`` argument.
        test: When true, load the test split instead of the train split.

    Returns:
        A single test ``DataLoader`` when ``test`` is true, otherwise a
        ``(train_loader, valid_loader)`` tuple.

    Raises:
        ValueError: If ``valid_size`` is outside ``[0, 1]`` when building the
            train/validation loaders.
    """
    # Per-channel (R, G, B) statistics used for normalization.
    # NOTE(review): these look like the commonly quoted CIFAR-10 statistics;
    # confirm they are the intended values for CIFAR-100.
    normalize = transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
    )

    # Compose applies the operations in order: resize -> tensor -> normalize
    transform = transforms.Compose([
        transforms.Resize((227, 227)),
        transforms.ToTensor(),
        normalize,
    ])

    # Test split: one loader, no train/validation division
    if test:
        test_dataset = datasets.CIFAR100(
            root=data_dir, train=False,
            download=True, transform=transform
        )
        # Named test_loader so the enclosing function name is not shadowed
        test_loader = torch.utils.data.DataLoader(
            test_dataset, batch_size=batch_size, shuffle=shuffle
        )
        return test_loader

    if not 0 <= valid_size <= 1:
        raise ValueError(
            'valid_size must be in the range [0, 1], got {}'.format(valid_size)
        )

    # Train split, loaded twice so that train and validation each own a
    # dataset object.  Both MUST be CIFAR-100: the samplers below share one
    # index space, and the labels have to come from the same label set.
    train_dataset = datasets.CIFAR100(
        root=data_dir, train=True,
        download=True, transform=transform
    )

    valid_dataset = datasets.CIFAR100(
        root=data_dir, train=True,
        download=True, transform=transform
    )

    num_train = len(train_dataset)
    indices = list(range(num_train))
    split = int(np.floor(valid_size * num_train))

    if shuffle:
        # Seeding makes the train/validation division reproducible
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    # The first `split` indices go to validation, the remainder to training
    train_idx, valid_idx = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler
    )

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, sampler=valid_sampler
    )
    return train_loader, valid_loader


# CIFAR-100 dataset: every loader reads from ./data in batches of 16
_loader_args = dict(data_dir='./data', batch_size=16)

# The default call returns the (train, validation) loader pair
train_loader, valid_loader = data_loader(**_loader_args)

# test=True returns a single loader over the test split
test_loader = data_loader(test=True, **_loader_args)

## Step3. Build A VGG Model

For torch, a few points need attention:
1. Every custom model needs to inherit from the ```nn.Module``` class, as it provides basic functionality that helps the model to train.
2. Two things should be defined first: the different layers of our model, inside the `__init__` function, and the sequence in which these layers will be executed on the input, inside the `forward` function.

The following modules are used to define the layers:
1. ```nn.Conv2d```: convolutional layers
2. ```nn.BatchNorm2d```: batch normalization
3. ```nn.ReLU```: activation function
4. ```nn.MaxPool2d```: max pooling, which helps to avoid overfitting
5. ```nn.Linear```: fully connected (fc) layer
6. ```nn.Sequential```: this module helps to combine different operations that are part of the same step.

In [None]:
class VGG16(nn.Module):
    """VGG-16 style classifier with batch normalization after each convolution.

    Thirteen 3x3 convolution stages (``layer1`` .. ``layer13``) are followed
    by three fully connected stages (``fc``, ``fc1``, ``fc2``).  The first
    fully connected layer takes 7*7*512 flattened features.

    Args:
        num_classes: Size of the final output layer.
    """

    # (in_channels, out_channels, ends_with_max_pool) for layer1 .. layer13
    _CONV_PLAN = (
        (3, 64, False),
        (64, 64, True),
        (64, 128, False),
        (128, 128, True),
        (128, 256, False),
        (256, 256, False),
        (256, 256, True),
        (256, 512, False),
        (512, 512, False),
        (512, 512, True),
        (512, 512, False),
        (512, 512, False),
        (512, 512, True),
    )

    def __init__(self, num_classes=10):
        super().__init__()

        # Register the convolution stages in order under the names
        # layer1 .. layer13.
        for index, (in_ch, out_ch, pooled) in enumerate(self._CONV_PLAN, start=1):
            stage = self._conv_stage(in_ch, out_ch, pooled)
            setattr(self, 'layer{}'.format(index), stage)

        # Classifier head: two hidden layers with dropout, then the output layer
        self.fc = self._dense_stage(7 * 7 * 512, 4096)
        self.fc1 = self._dense_stage(4096, 4096)
        self.fc2 = nn.Sequential(nn.Linear(4096, num_classes))

    @staticmethod
    def _conv_stage(in_channels, out_channels, pooled):
        """Return conv3x3 -> batch norm -> ReLU, plus a 2x2 max pool if ``pooled``."""
        modules = [
            nn.Conv2d(in_channels, out_channels,
                      kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        ]
        if pooled:
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
        return nn.Sequential(*modules)

    @staticmethod
    def _dense_stage(in_features, out_features):
        """Return dropout(0.5) -> linear -> ReLU."""
        return nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(in_features, out_features),
            nn.ReLU(),
        )

    def forward(self, x):
        """Run ``x`` through all convolution stages, flatten, then classify."""
        features = x
        for index in range(1, len(self._CONV_PLAN) + 1):
            features = getattr(self, 'layer{}'.format(index))(features)

        # Keep the batch dimension and flatten everything else
        flat = features.reshape(features.size(0), -1)
        return self.fc2(self.fc1(self.fc(flat)))

## Step4. Hyperparameters

In [None]:
import gc

# Release unused cached GPU memory before setting things up
torch.cuda.empty_cache()

# Hyperparameters
num_classes, num_epochs = 100, 20
batch_size = 16
learning_rate = 0.005

# Collect garbage and clear the CUDA cache once more right before the model
# is created and moved to the target device
gc.collect()
torch.cuda.empty_cache()
model = VGG16(num_classes)
model = model.to(device)

# Loss and optimizer: cross-entropy minimized by SGD with momentum and weight decay
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=learning_rate,
    momentum=0.9,
    weight_decay=0.005,
)

# Number of batches in the training loader
total_step = len(train_loader)

## Step5. Train

1. Set the gradients to zero before every update with ```optimizer.zero_grad()```.
2. In the validation part, use ```with torch.no_grad()``` to disable gradient computation for faster evaluation.

In [None]:
# Show the number of batches in the training loader
print(total_step)

In [None]:
# Number of batches per training epoch, used for the progress bar and the log line
total_steps = len(train_loader)
# Training stops once this many epochs have run, even if num_epochs is larger
target_epoch = 2
print(total_steps)

for epoch in range(num_epochs):
    if epoch == target_epoch:
        break

    # Training mode: Dropout is active and BatchNorm uses batch statistics.
    # Needed every epoch because validation below switches to eval mode.
    model.train()
    for i, (images, labels) in tqdm(enumerate(train_loader), total=total_steps):
        # Move tensors to the configured device
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize; gradients are cleared first because
        # backward() accumulates into .grad
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Reports the loss of the last batch of the epoch
    print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, i+1, total_steps, loss.item()))

    # Validation
    # Evaluation mode: Dropout is disabled and BatchNorm uses (and no longer
    # updates) its running statistics, so the accuracy is deterministic and
    # validation data does not leak into the model state.
    model.eval()
    # Free training leftovers once, rather than on every validation batch
    gc.collect()
    torch.cuda.empty_cache()
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            # Index of the largest logit is the predicted class
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

        # Report the number of images actually evaluated
        print('Accuracy of the network on the {} validation images: {} %'.format(total, 100 * correct / total))

## Step6. Test

In [None]:
# Evaluation mode: disable Dropout and make BatchNorm use its running
# statistics so the reported test accuracy is deterministic.
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        # Move tensors to the configured device
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # Index of the largest logit is the predicted class
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        del images, labels, outputs

    # Report the number of images actually evaluated
    print('Accuracy of the network on the {} test images: {} %'.format(total, 100 * correct / total))