In [6]:
import torch
import torch.nn as nn
from torchvision import datasets, transforms
import torch.optim as optim
from datetime import datetime
import numpy as np
from torch.utils.data import random_split

# Hint from section 3.3.
torch.manual_seed(123)
torch.set_default_dtype(torch.double)

In [7]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# 1. Loading data

In [8]:
# 1. Loading data
def load_cifar(train_val_split=0.9, data_path='../data/', mean_std_matrix=None):
    
    # This code is based on the code from the weekly exercises
    
    # Define preprocessor specific for CIFAR-10. The normalization 
    # is done to better accommodate for faster training.
    if mean_std_matrix is None:
        preprocessor = transforms.ToTensor()
        
        
    else:
        preprocessor = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((mean_std_matrix[0,0], mean_std_matrix[0,1], mean_std_matrix[0,2]),
                                (mean_std_matrix[1,0], mean_std_matrix[1,1], mean_std_matrix[1,2]))])
        
            
    # Load datasets.
    data_train_val = datasets.CIFAR10(
        data_path,       
        train=True,      
        download=True,
        transform=preprocessor)

    data_test = datasets.CIFAR10(
        data_path, 
        train=False,
        download=True,
        transform=preprocessor)

    # Train/Validation split.
    n_train = int(len(data_train_val)*train_val_split)
    n_val =  len(data_train_val) - n_train

    data_train, data_val = random_split(
        data_train_val, 
        [n_train, n_val],
        generator=torch.Generator().manual_seed(123)
    )
    return (data_train, data_val, data_test)

# Collecting the training set to be able to calculate the mean and std values
# for normalization
cifar10_train, _, _ = load_cifar()

# Now we define a lighter version of CIFAR-10, CIFAR-2.
label_map = {0: 0, 2: 1}
class_names = ['airplane', 'bird']

# For each dataset, keep only airplanes and birds.
cifar2_train = [(img, label_map[label]) for img, label in cifar10_train if label in [0, 2]]
cifar2_imgs = [train_tuple[0].numpy() for train_tuple in cifar2_train]

# We calculate the training set's mean and std for each rgb channel
means = np.mean(cifar2_imgs, axis=(0, 2, 3))
stds = np.std(cifar2_imgs, axis=(0, 2, 3))
mean_std_matrix = np.array([means, stds])

print('\nThe mean_std_matrix used for dataset standardization is: \n')
print(str(mean_std_matrix) + '\n')

# With the mean and std values, we collect the appropriately normalized CIFAR-2 evaluation sets
cifar10_train, cifar10_val, cifar10_test = load_cifar(mean_std_matrix=mean_std_matrix)


cifar2_val = [(img, label_map[label]) for img, label in cifar10_val if label in [0, 2]]
cifar2_test = [(img, label_map[label]) for img, label in cifar10_test if label in [0, 2]]

print('\nDone splitting and normalizing the CIFAR-2 dataset!\n')
print('Size of the training dataset: ', len(cifar2_train))
print('Size of the validation dataset: ', len(cifar2_val))
print('Size of the test dataset: ', len(cifar2_test))




Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ../data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ../data/cifar-10-python.tar.gz to ../data/
Files already downloaded and verified

The mean_std_matrix used for dataset standardization is: 

[[0.50758954 0.52643419 0.50727447]
 [0.23966236 0.23350179 0.26788819]]

Files already downloaded and verified
Files already downloaded and verified

Done splitting and normalizing the CIFAR-2 dataset!

Size of the training dataset:  9017
Size of the validation dataset:  983
Size of the test dataset:  2000


# 2. Defining a Multi-Layer Perceptron in PyTorch

In [3]:
# 2. Defining a Multi-Layer Perceptron in PyTorch
class MyMLP(nn.Module):
    def __init__(self):
        super().__init__()
        
        # Flattening the input so as to have 32*32*3 neurons as input in a one-
        # dimensional vector.
        self.L = 4
        self.flat = nn.Flatten() 

        # The instructions from the task are set.
        self.fc1 = nn.Linear(32*32*3, 512)
        self.act1 = nn.ReLU()
        self.fc2 = nn.Linear(512, 128)
        self.act2 = nn.ReLU()
        self.fc3 = nn.Linear(128, 32)
        self.act3 = nn.ReLU()

        self.fc4 = nn.Linear(32, 2)
        # Leaving out an activation function for the last layer according to the
        # project description.
        
    def forward(self, x):

        # Forward-propagation through the network.
        out = self.flat(x)
        out = self.act1(self.fc1(out))
        out = self.act2(self.fc2(out))
        out = self.act3(self.fc3(out))
        out = self.fc4(out)
        
        return out
    

# 3. train()-function

In [4]:
# 3. train()-function
def train(n_epochs, optimizer, model, loss_fn, train_loader):
    
    # Setting a few standard variables and empty lists.
    n_batch = len(train_loader)
    losses_train = []
    
    # The model is set to be trained by a PyTorch method.
    model.train()
    optimizer.zero_grad(set_to_none=True)
    
    for epoch in range(1, n_epochs+1):
        
        loss_train = 0.0
        
        # This for-loop calculates loss and makes a gradient descent step.
        for imgs, labels in train_loader:
            
            # Following the hint in section 3.3.
            imgs = imgs.to(device=device, dtype=torch.double) 
            labels = labels.to(device=device)
            
            # Forward-propagates the input.
            outputs = model(imgs)
            
            # The loss is computed, and a backward-propagation is done.
            loss = loss_fn(outputs, labels)
            loss.backward()
            
            # Gradient descent is done, and the gradients are zeroed out for
            # each minibatch.
            optimizer.step()
            optimizer.zero_grad()

            # The training losses are stored for each gradient descent step.
            loss_train += loss.item()
            
        # The average of the losses computed for one epoch. 
        losses_train.append(loss_train/n_batch)

        # Motivational text keeping the user updated on the progress.
        if epoch%10 == 0 or epoch == 1:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
                datetime.now().time(), epoch, loss_train/n_batch))
    return losses_train


# 4. Manual training function

In [5]:
# 4. Manual training function
def train_manual_update(n_epochs, lr, model, loss_fn, train_loader):
    
    # Setting a few standard variables and empty lists.
    n_batch = len(train_loader)
    losses_train = []
    
    # The model is set to be trained by a PyTorch method.
    model.train()
    
    for epoch in range(1, n_epochs+1):
        loss_train = 0.0
        for imgs, labels in train_loader:
            
            # Following the hint in section 3.3.
            imgs = imgs.to(device=device, dtype=torch.double) 
            labels = labels.to(device=device)
                
            # Forward-propagates the input.
            outputs = model(imgs)
                
            # The loss is computed, and a backward-propagation is done.
            loss = loss_fn(outputs, labels)
            loss.backward()
            
            with torch.no_grad():
                
                # Iterating over the model parameters.
                for p in model.parameters():
                    
                    # Applying the formula for gradient descent.
                    p.data = p.data - lr * p.grad
                    p.grad = torch.zeros_like(p.grad)

            # The training losses are stored for each gradient descent step.
            loss_train += loss.item()
        
        # The average of the losses computed for one epoch. 
        losses_train.append(loss_train/n_batch)

        # Motivational text keeping the user updated on the progress.
        if epoch%10 == 0 or epoch == 1:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
                datetime.now().time(), epoch, loss_train / n_batch))    
    return losses_train


# 5. Training 2 instances of MyMLP

In [6]:
# 5. Training 2 instances of MyMLP

# We define a function for verifying similarity of training losses. This takes
# care of numerical nuances.
def relative_error(a, b):
    return (a - b) / a

# Evaluating the processing unit to train on.
device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))
print(f"Training on device {device}.")

# The train loader is set with an appropriate batch size.
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=256, shuffle=False)

# A seed must be set before each instantiation of the model for reproducibility.
torch.manual_seed(123)
model = MyMLP().to(device=device)

# The optimizer and the loss function is set.
optimizer = optim.SGD(model.parameters(), lr=1e-2)
loss_fn = nn.CrossEntropyLoss()

# This function utilizes PyTorch's already written code for the SGD optimizer.
losses_train_SGD = train(
    n_epochs = 20,
    optimizer = optimizer,
    model = model,
    loss_fn = loss_fn,
    train_loader = train_loader
)

# Re-instantiating model = MyMLP() with the same seed.
torch.manual_seed(123)
model = MyMLP().to(device=device)

# The manual gradient descent algorithm is contained in this training function.
losses_train_manual = train_manual_update(
    n_epochs = 20,
    lr = 1e-2,
    model = model,
    loss_fn = loss_fn,
    train_loader = train_loader
)
# This code checks whether the models coincide, where a threshold is 
# set to classify numerical losses.
list_of_trues = []
for i, _ in enumerate(losses_train_SGD):
    if abs(relative_error(losses_train_SGD[i], losses_train_manual[i])) <= 10**(-10):
        list_of_trues.append(True)
if len(list_of_trues) == len(losses_train_SGD):
    print("The train-functions seem to output the same models!")

Training on device cpu.
21:44:41.755841  |  Epoch 1  |  Training loss 0.695
21:44:53.867573  |  Epoch 10  |  Training loss 0.581
21:45:07.845948  |  Epoch 20  |  Training loss 0.502
21:45:09.428939  |  Epoch 1  |  Training loss 0.695
21:45:24.170698  |  Epoch 10  |  Training loss 0.581
21:45:40.033089  |  Epoch 20  |  Training loss 0.502
The train-functions seem to output the same models!


# 6. Adding regularization/weight decay to the manual train function

In [7]:
# 6. Adding regularization/weight decay to the manual train function
def train_manual_update_with_L2(n_epochs, lr, model, loss_fn, train_loader, weight_decay):
    
    # Setting a few standard variables and empty lists.
    n_batch = len(train_loader)
    losses_train = []
    
    # The model is set to be trained by a PyTorch method.
    model.train()
    
    for epoch in range(1, n_epochs + 1):
        
        loss_train = 0.0
        
        for imgs, labels in train_loader:

            # Following the hint in section 3.3.
            imgs = imgs.to(device=device, dtype=torch.double) 
            labels = labels.to(device=device)
            
            # Forward-propagates the input.
            outputs = model(imgs)
            
            # The losses are computed, and a backward-propagation is done.
            loss = loss_fn(outputs, labels)
            loss.backward()
            
            
            with torch.no_grad():
                
                # Iterating over the model parameters
                for p in model.parameters():
                    
                    # Applying the formula for weight-decay/L2-regularization
                    # Note that the lambda has been subsituted by 
                    # a hyperparameter weight_decay as the .pdf-file suggests.
                    # (see the parenthesis). 
                    # Applying the formula for gradient descent in the same line.
                    p.data = p.data - lr * (p.grad + weight_decay * p.data)
                    p.grad = torch.zeros_like(p.grad)
            
            # The training losses are stored for each gradient descent step.
            loss_train += loss.item()
            
        # The average of the losses computed for one epoch.
        losses_train.append(loss_train/n_batch)

        # Motivational text keeping the user updated on the progress.
        if epoch%10 == 0 or epoch == 1:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
            datetime.now().time(), epoch, loss_train / n_batch))    
    return losses_train


In [8]:
#%% This section runs the code for the same inputs as previous models, but now with a weight decay term.
# We define a function for verifying similarity of training losses. This takes
# care of numerical nuances.
def relative_error(a, b):
    return (a - b) / a

# Evaluating the processing unit to train on.
device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))
print(f"Training on device {device}.")

train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=256, shuffle=False)


# Seed must be set before each instantiation of the model.
torch.manual_seed(123)
model = MyMLP().to(device=device)

# The optimizer and the loss function is set.
optimizer = optim.SGD(model.parameters(), lr=1e-2, weight_decay=0.01)
loss_fn = nn.CrossEntropyLoss()

# This function utilizes PyTorch's already written code for the SGD optimimzer.
losses_train_SGD = train(
    n_epochs = 20,
    optimizer = optimizer,
    model = model,
    loss_fn = loss_fn,
    train_loader = train_loader
)

# Re-instantiating model = MyMLP() with the same seed.
torch.manual_seed(123)
model = MyMLP().to(device=device)

# The manual gradient descent algorithm is contained in this training function.
losses_train_manual_weight_decay = train_manual_update_with_L2(
    n_epochs = 20,
    lr = 1e-2,
    model = model,
    loss_fn = loss_fn,
    train_loader = train_loader,
    weight_decay = 0.01
)

# This code checks whether the models coincide, where a threshold is 
# set to classify numerical losses.
list_of_trues = []
for i, _ in enumerate(losses_train_SGD):
    if abs(relative_error(losses_train_SGD[i], losses_train_manual_weight_decay[i])) <= 10**(-10):
        list_of_trues.append(True)
if len(list_of_trues) == len(losses_train_SGD):
    print("The train-functions seem to output the same models!")


Training on device cpu.
21:46:34.853788  |  Epoch 1  |  Training loss 0.695
21:46:48.595777  |  Epoch 10  |  Training loss 0.593
21:47:03.543391  |  Epoch 20  |  Training loss 0.506
21:47:05.473513  |  Epoch 1  |  Training loss 0.695
21:47:23.069549  |  Epoch 10  |  Training loss 0.593
21:47:42.295341  |  Epoch 20  |  Training loss 0.506
The train-functions seem to output the same models!


# 7. Adding momentum to the manual training function

In [9]:
#%% 7. Adding momentum to the manual training function
def train_manual_update_with_L2_decay_momentum(n_epochs, lr, model, loss_fn, train_loader, weight_decay, momentum_coeff):
    
    # Setting a few standard variables and empty lists
    n_batch = len(train_loader)
    losses_train = []
    start_gradients = []
    
    # We follow Andrew's notations here for the momentum gradients. NOTE:
    # vdW_vdb implies derivatives wrt weights and biases.
    vdW_vdb = []
    
    # The model is set to be trained by a PyTorch method
    model.train()
    
    for epoch in range(1, n_epochs + 1):
        
        loss_train = 0.0
        
        for imgs, labels in train_loader:

            # Following the hint in section 3.3.
            imgs = imgs.to(device=device, dtype=torch.double) 
            labels = labels.to(device=device)
            
            # Forward-propagates the input
            outputs = model(imgs)
            
            # The losses are computed, and a backward-propagation is done.
            loss = loss_fn(outputs, labels)
            loss.backward()
            
            
            with torch.no_grad():
                
                # Iterating over the model parameters
                # Here we need the index i for the momentum calculation.
                for i, p in enumerate(model.parameters()):
                    
                    # Applying the formula for weight-decay/L2-regularization
                    # Note that the lambda has been subsituted by 
                    # a hyperparameter weight_decay as the .pdf-file suggests.
                    p.grad = p.grad + weight_decay * p.data
                    
                    # These if and else statements will store the gradients
                    # for the first minibatch.
                    if len(vdW_vdb) == 0:
                        if len(start_gradients) == 0:
                            start_gradients = [p.grad]
                        else:
                            start_gradients.append(p.grad)
                    else:
                        
                        # NOTE! The (1 - momentum_coeff) term has been omitted,
                        # because that gave the same result as the train()-
                        # function with the SGD optimizer from PyTorch.
                        
                        # Updating the gradient of the weights and biases
                        # in their respective places in the network (i denotes
                        # where in the network we are).
                        p.grad = vdW_vdb[i] * momentum_coeff + p.grad
                        vdW_vdb[i] = p.grad
                        
                    
                    # Applying the formula for gradient descent.
                    p.data = p.data - lr * p.grad
                    p.grad = torch.zeros_like(p.grad)
            
            # This if-statement is only satisfied for the first minibatch: it stores
            # the gradients of the first minibatch such that the gradients for
            # the second mini-batch can build upon the "first gradients" for momentum
            # calculation. 
            if len(vdW_vdb) == 0:
                vdW_vdb = start_gradients
            
            # The training losses are stored for each gradient descent step.
            loss_train += loss.item() 
            
        # The average of the losses computed for one epoch.
        losses_train.append(loss_train/n_batch)

        # Motivational text keeping the user updated on the progress.
        if epoch%10 == 0 or epoch == 1:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
            datetime.now().time(), epoch, loss_train / n_batch))
    return losses_train


In [10]:
#%% This section runs the code for the same inputs as previous models, but now with a weight decay term and a momentum coefficient.
# We define a function for verifying similarity of training losses. This takes
# care of numerical nuances.
def relative_error(a, b):
    return (a - b) / a

# Evaluating the processing unit to train on.
device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))
print(f"Training on device {device}.")

train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=256, shuffle=False)


# Seed must be set before each instantiation of the model.
torch.manual_seed(123)
model = MyMLP().to(device=device)

# The optimizer and the loss function is set.
optimizer = optim.SGD(model.parameters(), lr=1e-2, weight_decay=0.01, momentum=0.9)
loss_fn = nn.CrossEntropyLoss()

# This function utilizes PyTorch's already written code for the SGD optimimzer.
losses_train_SGD = train(
    n_epochs = 20,
    optimizer = optimizer,
    model = model,
    loss_fn = loss_fn,
    train_loader = train_loader
)

# Re-instantiating model = MyMLP() with the same seed.
torch.manual_seed(123)
model = MyMLP().to(device=device)

# The manual gradient descent algorithm is contained in this training function.
losses_train_manual_weight_decay_momentum = train_manual_update_with_L2_decay_momentum(
    n_epochs = 20,
    lr = 1e-2,
    model = model,
    loss_fn = loss_fn,
    train_loader = train_loader,
    weight_decay = 0.01,
    momentum_coeff = 0.9
)

# This code checks whether the models coincide, where a threshold is 
# set to classify numerical losses.
list_of_trues = []
for i, _ in enumerate(losses_train_SGD):
    if abs(relative_error(losses_train_SGD[i], losses_train_manual_weight_decay_momentum[i])) <= 10**(-10):
        list_of_trues.append(True)
if len(list_of_trues) == len(losses_train_SGD):
    print("The train-functions seem to output the same models!")


Training on device cpu.
21:48:19.329496  |  Epoch 1  |  Training loss 0.680
21:48:33.109078  |  Epoch 10  |  Training loss 0.456
21:48:48.863826  |  Epoch 20  |  Training loss 0.388
21:48:51.394392  |  Epoch 1  |  Training loss 0.680
21:49:12.080204  |  Epoch 10  |  Training loss 0.456
21:49:34.543091  |  Epoch 20  |  Training loss 0.388
The train-functions seem to output the same models!


# 8. Training multiple instances with different parameters

In [14]:
#%% 8. Training multiple instances with different parameters

# Setting a few standard variables and empty lists.
models = []
model_names = []
losses_train = []
losses_val = []

# Asking the user for the number of inputs.
answer = int(input("How many models would you like to train? "))

device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))
print(f"Training on device {device}.")

# This will set the random numbers inside the for loop to not be random for 
# reproducibility. 
np.random.seed(666)

for i in range(1, answer+1):
    
    train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=256, shuffle=False)
    torch.manual_seed(123)
    model = MyMLP().to(device=device) 
    loss_fn = nn.CrossEntropyLoss()
    
    # The parameters are made such that the model does a random search instead
    # of a grid search as Andrew's videos suggest.
    lr = 10**(-4*np.random.rand())
    weight_decay = 10**(-4*np.random.rand())
    momentum_coeff = 1 - 10**(-4*np.random.rand())
    
    train_manual_update_with_L2_decay_momentum(n_epochs=10,
                                               lr=lr,
                                               model=model,
                                               loss_fn=loss_fn,
                                               train_loader=train_loader,
                                               weight_decay=weight_decay, 
                                               momentum_coeff=momentum_coeff)
    
    # Storing the models for later use.
    models.append(model)
    model_names.append('MyMLP_' + str(i) + ', lr = ' + str(round(lr, 5)) + ', weight_decay = ' + str(round(weight_decay, 5)) + ', momentum_coeff = ' + str(round(momentum_coeff, 5)))
    print(model_names[i-1])

How many models would you like to train? 10
Training on device cpu.
22:03:44.779596  |  Epoch 1  |  Training loss 0.693
22:04:07.025673  |  Epoch 10  |  Training loss 0.480
MyMLP_1, lr = 0.00158, weight_decay = 0.00042, momentum_coeff = 0.99803
22:04:09.522414  |  Epoch 1  |  Training loss 0.699
22:04:29.218217  |  Epoch 10  |  Training loss 0.689
MyMLP_2, lr = 0.00123, weight_decay = 0.00016, momentum_coeff = 0.11042
22:04:31.450573  |  Epoch 1  |  Training loss 0.694
22:04:51.992843  |  Epoch 10  |  Training loss 0.693
MyMLP_3, lr = 0.02216, weight_decay = 0.63789, momentum_coeff = 0.60163
22:04:54.277988  |  Epoch 1  |  Training loss 0.691
22:05:14.069675  |  Epoch 10  |  Training loss 0.593
MyMLP_4, lr = 0.00928, weight_decay = 0.15813, momentum_coeff = 0.99894
22:05:16.296145  |  Epoch 1  |  Training loss 0.718
22:05:36.363058  |  Epoch 10  |  Training loss 0.694
MyMLP_5, lr = 0.16921, weight_decay = 0.00157, momentum_coeff = 0.93284
22:05:38.716337  |  Epoch 1  |  Training loss 0

# 9. Selecting the best model

In [15]:
#%% 9. Selecting the best model
def compute_accuracy(model, loader):
    # This code has been pulled from the Week 06 - Machine Learning pipeline
    # and MLP.
    
    # Setting the model to evaluation mode.
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for imgs, labels in loader:
            imgs = imgs.to(device=device)
            labels = labels.to(device=device)
                
            # Doing a forward pass of the validation data
            outputs = model(imgs)
            _, predicted = torch.max(outputs, dim=1)
            total += labels.shape[0]
            correct += int((predicted == labels).sum())

    acc =  correct/total
    print("Accuracy: {:.4f}".format(acc))
    return acc

# Setting the train and validation loaders.
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=256, shuffle=False)
val_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=256, shuffle=False)

accuracies = []

# Checking the validation accuracy of each trained model.
for model in models:
    accuracies.append(compute_accuracy(model, val_loader))
    
# Get the index of the best model in the model-list.
i_best_model = np.argmax(accuracies)
best_model = models[i_best_model]

# Revealing the best hyperparameters resulting in the best validation accuracy.
print("\nThe best model is ", model_names[i_best_model])

# Printing the training and validation accuracies of the best model.
print("Training accuracy of the best model: ")
compute_accuracy(best_model, train_loader)
print("Validation accuracy of the best model: ")
compute_accuracy(best_model, val_loader)


Accuracy: 0.7375
Accuracy: 0.4954
Accuracy: 0.5046
Accuracy: 0.7508
Accuracy: 0.5158
Accuracy: 0.4954
Accuracy: 0.5046
Accuracy: 0.7538
Accuracy: 0.7518
Accuracy: 0.6460

The best model is  MyMLP_8, lr = 0.00123, weight_decay = 0.04364, momentum_coeff = 0.83782
Training accuracy of the best model: 
Accuracy: 0.7141
Validation accuracy of the best model: 
Accuracy: 0.7538


0.7538148524923703

# 10. Evaluating the best model

In [16]:
#%% 10. Evaluating the best model

# Setting the test loaders.
test_loader = torch.utils.data.DataLoader(cifar2_test, batch_size=256, shuffle=False)

# Printing the test accuracy of the best model.
print("Test accuracy of the best model: ")
compute_accuracy(best_model, test_loader)

Test accuracy of the best model: 
Accuracy: 0.7395


0.7395