In [None]:
import numpy as np
import pandas as pd
import scipy.special as sp
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import PIL
import PIL.Image

import torch
from torch import nn
from torch.autograd import Variable
from torch import optim
from torchvision.transforms import ToTensor

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Data preparation

In [None]:
import os

# Read fer2013.csv and shuffle all of its rows in one go; the fixed
# random_state makes the shuffled order reproducible.
df = pd.read_csv('../DL_and_NN_in_Python/fer2013.csv').sample(frac=1, random_state=42)

In [None]:
# Emotion names; the position of a name in this list is the integer label
# that show_sample_image looks up in df.emotion.
emotions = ['Anger', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']

def show_sample_image(emotion: str):
    """Print and display one randomly chosen image of the given emotion.

    Args:
        emotion: One of the names in ``emotions``.

    Raises:
        ValueError: If ``emotion`` is not in ``emotions``, or if ``df``
            contains no rows carrying the corresponding label.
    """
    emotion_no = emotions.index(emotion)
    df_emotion = df[df.emotion == emotion_no]
    if df_emotion.empty:
        raise ValueError(f'No samples found for emotion {emotion!r}')

    # randint excludes the upper bound, so the drawn position is always a
    # valid iloc index (the deprecated random_integers included it).
    random_no = np.random.randint(0, len(df_emotion))
    sample = df_emotion.iloc[random_no]
    print(random_no)
    print(sample)

    # The pixel column is a space-separated string of integers; it is laid
    # out as a 48x48 image and then enlarged for viewing.
    img = np.array(list(map(int, sample.pixels.split(' '))), dtype=np.uint8).reshape((48,48))
    img = PIL.Image.fromarray(img).resize((1000, 1000))
    img.show()

show_sample_image('Surprise')

In [None]:
# Fraction of the rows that goes into the training set; the remainder
# becomes the test set.
train_proportion = 0.8
train_index = int(len(df) * train_proportion)

# Positional split: everything before train_index trains, the rest tests.
train_df, test_df = df.iloc[:train_index], df.iloc[train_index:]

In [None]:
# N: number of training samples
# D: number of pixel values in the first training sample
# D1: integer square root of D, used as the image side length
N = len(train_df)
first_pixels = train_df.iloc[0].pixels
D = len(first_pixels.split(' '))
D1 = int(np.sqrt(D))

# (the train/test split could equally be done with train_test_split from
# sklearn.model_selection for the same effect)

# Number of distinct labels present in the training set.
n_classes = len(set(train_df.emotion))

print(f'N = {N}, D = {D}, n_classes: {n_classes}')
print(
    f'Number of samples in training set: {len(train_df)}',
    f'Number of samples in test set: {len(test_df)}',
    sep='\n',
)

Create a custom Dataset object:

In [None]:
class CustomImageDataset(Dataset):
    """Dataset over a DataFrame whose rows hold a label and a pixel string.

    Column 0 is read as the label and column 1 as a space-separated string
    of integer pixel values describing one square 2D image.
    """

    def __init__(self, data, transform=None, target_transform=None):
        """Store the frame and the optional transforms.

        Args:
            data: DataFrame with the label in column 0 and the pixel string
                in column 1.
            transform: Optional callable applied to the 2D ``uint8`` image.
            target_transform: Optional callable applied to the label.
        """
        self.data = data
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``.

        Raises:
            ValueError: If the number of pixel values is not a perfect
                square and so cannot form a square image.
        """
        label = self.data.iloc[idx, 0]
        pixels = self.data.iloc[idx, 1].split(' ')
        # dtype needs to be np.uint8, otherwise ToTensor() won't scale pixel values to [0.0, 1.0]
        image = np.array(list(map(int, pixels)), dtype=np.uint8)

        # Derive the side length from the sample itself instead of relying
        # on a notebook-level global defined in another cell.
        side = int(round(np.sqrt(image.size)))
        if side * side != image.size:
            raise ValueError(
                f'Cannot reshape {image.size} pixel values into a square image'
            )
        image = image.reshape((side, side)) # reshape because ToTensor() expects a 2D image

        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, label

In [None]:
# Mini-batch size shared by both loaders.
batch_size = 64

# Wrap each split in a Dataset; ToTensor() is applied to every image on access.
train_data = CustomImageDataset(train_df, transform=ToTensor())
test_data = CustomImageDataset(test_df, transform=ToTensor())

# Both loaders reshuffle their samples on every pass over the data.
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)

# NNs in PyTorch

The logic behind creating a model in PyTorch is pretty much the same as in TensorFlow. There are some syntactical differences, though.

1. The activation function is not specified as part of the layer, but separately.
2. The input and output dimensions need to be supplied, unlike in Keras, where only the output is needed. Note that in pure TF, without Keras, we also need to specify the input dims unless we postpone the build of the model (see https://www.tensorflow.org/guide/intro_to_modules#waiting_to_create_variables)

    See also this package which can directly infer layer sizes upon passing sample input: https://github.com/szymonmaszke/torchlayers
3. The input has to be explicitly turned into a torch.Tensor (TF can handle that automatically)
4. No explicit ```.fit``` function, need to write the training loop ourselves
5. Custom weight initialisation is not so straightforward
6. L2 regularisation is added to the optimiser, not to the layer. It is usually referred to as ```weight_decay```:

    https://stackoverflow.com/questions/42704283/l1-l2-regularization-in-pytorch
    https://pytorch.org/docs/stable/generated/torch.optim.Adam.html#torch.optim.Adam

    L1 regularisation is harder but can be implemented by creating a custom layer:
    https://stackoverflow.com/a/66630301

    

In [None]:
# Pick the compute backend by priority: CUDA first, then Apple's MPS,
# falling back to the CPU when neither is available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

In [None]:
class ModelPyTorch(torch.nn.Module):
    """Fully connected classifier with one hidden ReLU layer.

    The input is flattened, mapped to ``M`` hidden units, passed through a
    ReLU and finally mapped to ``n_classes`` outputs.

    Args:
        M: Number of hidden units.
    """

    def __init__(self, M: int = 10):
        super().__init__()
        self.flatten = nn.Flatten()

        hidden_layer = nn.Linear(D1*D1, M)
        output_layer = nn.Linear(M, n_classes)
        self.linear_relu_stack = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)

        # Casts the module's floating-point parameters to float32; inputs
        # must therefore be supplied as float32 as well.
        self.float()

    def forward(self, x):
        """Flatten ``x`` and return the unnormalised class scores (logits)."""
        return self.linear_relu_stack(self.flatten(x))

In [None]:
class ModelPyTorchV2(nn.Module):
    """Same two-layer network as a plain module with individually named layers.

    Args:
        M: Number of hidden units.
        activation_function: Module applied between the two linear layers.
            The default instance is created once, when the class is
            defined, and is shared by every model relying on the default.
    """

    def __init__(self, M: int = 10, activation_function = nn.ReLU()):
        super().__init__()
        # Attribute order determines the order in which print(model) lists the layers.
        self.flatten = nn.Flatten()
        self.layer1 = nn.Linear(D1*D1, M)
        self.layer2 = nn.Linear(M, n_classes)
        self.activation_function = activation_function
        # Cast the floating-point parameters to float32.
        self.float()

    def forward(self, x):
        """Return the logits for ``x`` after flattening it."""
        hidden = self.layer1(self.flatten(x))
        return self.layer2(self.activation_function(hidden))

In [None]:
def constructNNPyTorch(M: int = 10, activation_function = nn.ReLU()):
    """Assemble the two-layer network as an ``nn.Sequential`` with named modules.

    Args:
        M: Number of hidden units.
        activation_function: Module placed between the two linear layers.

    Returns:
        The assembled ``nn.Sequential`` with its parameters cast to float32.
    """
    named_layers = (
        ('flatten', nn.Flatten()),
        ('name1', nn.Linear(D, M)),
        ('name2', activation_function),
        ('name3', nn.Linear(M, n_classes)),
    )

    model = nn.Sequential()
    for layer_name, layer in named_layers:
        model.add_module(layer_name, layer)

    # .float() casts in place and returns the module itself.
    return model.float()

Initialise the model and print some information about it. Note that no ```model.compile``` is needed in PyTorch.

In [None]:
# Build one instance of each variant and move it to the selected device.
model_v1 = ModelPyTorch().to(device)
model_v2 = ModelPyTorchV2().to(device)
model_v3 = constructNNPyTorch(M=10).to(device)

# Printing a module lists its sub-modules.
for built_model in (model_v1, model_v2, model_v3):
    print(built_model)

Custom weight initialisation is not as straightforward as in Keras.

- https://pytorch.org/docs/stable/nn.init.html
- https://stackoverflow.com/questions/49433936/how-do-i-initialize-weights-in-pytorch

In [None]:
## initialiser meant to be passed to model.apply(...)
def weights_init_normal(m):
    '''Re-initialise ``m`` if it is a linear layer; leave any other module untouched.

    Weights are drawn from a normal distribution with mean 0 and standard
    deviation 1/sqrt(fan_in), where fan_in is the layer's number of input
    features. Biases are set to zero.
    '''
    if not isinstance(m, nn.Linear):
        return

    std = 1/np.sqrt(m.in_features)

    # The same can be achieved by operating on the tensors directly:
    # m.weight.data.normal_(0.0, std) and m.bias.data.fill_(0)
    nn.init.normal_(m.weight, mean=0.0, std=std)
    nn.init.zeros_(m.bias)

In [None]:
# Collect the linear layers once; their biases are printed before and after
# the custom initialiser is applied to every sub-module of model_v1.
linear_layers = [module for module in model_v1.modules() if isinstance(module, nn.Linear)]

print('biases before custom initalisation:\n', [layer.bias.data for layer in linear_layers])
model_v1.apply(weights_init_normal)
print('and after:\n', [layer.bias.data for layer in linear_layers])

Making predictions:

In [None]:
# Feed a single random D1 x D1 input through the model, turn the logits into
# probabilities and report the index of the most probable class.
X = torch.rand((1, D1, D1), device=device, dtype=torch.float)
logits = model_v1(X)
pred_probab = torch.softmax(logits, dim=1)
y_pred = pred_probab.argmax(dim=1)
print(f"Predicted class: {y_pred}")

We can print the model's parameters. Note that the weights and biases at each layer are automatically given ```requires_grad=True```. If we constructed a computational graph by hand, i.e. without using Sequential, we would have to manually specify which parameters 'require grad', i.e. should be included in backpropagation. Switching off gradient tracking can also be done with ```torch.no_grad()```.

In [None]:
# For every parameter: its name, its shape, its values and a separator line.
for name, param in model_v1.named_parameters():
    print(name, param.size(), param, '--------------', sep='\n')

Similarly to TF, define a loss function and an optimiser:

In [None]:
# Cross-entropy averaged over the samples of a batch.
loss_function = nn.CrossEntropyLoss(reduction='mean')
# Adam updates the parameters of model_v1 only.
optimiser = optim.Adam(params=model_v1.parameters(), lr=1e-4)

The training step and the loop have to be defined manually:

In [None]:
def training_step(dataloader, model, loss_function, optimiser):
    """Run one pass over ``dataloader``, updating ``model`` after every batch.

    The loss of every 20th batch is printed together with the number of
    samples processed so far.

    Args:
        dataloader: Iterable yielding ``(X, Y)`` batches of input and
            target tensors.
        model: Module to train.
        loss_function: Callable mapping ``(predictions, targets)`` to a
            scalar loss tensor.
        optimiser: Optimiser that updates the parameters of ``model``.
    """
    # sets the model to the training mode
    # this is only important if we have batch normalisation or dropout
    # equivalent to TF's training=True option
    model.train()

    # Nothing here guarantees that the batches already live on the model's
    # device, so look the device up once and move every batch there.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    samples_seen = 0
    for batch, (X, Y) in enumerate(dataloader):
        if model_device is not None:
            X, Y = X.to(model_device), Y.to(model_device)

        loss = loss_function(model(X), Y)

        # calculate the gradient and apply it to the parameters
        loss.backward()
        optimiser.step()
        optimiser.zero_grad() # need to reset the gradient

        # Counting the samples directly avoids relying on a notebook-level
        # batch_size variable.
        samples_seen += len(X)
        if batch % 20 == 0:
            loss_val = loss.item()
            print(f'Current batch loss: {loss_val}')
            print(f'Current size: {samples_seen}')

In [None]:
def test_step(dataloader, model, loss_function):
    # now set the model to evaluation mode to avoid dropout etc.
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # do not compute gradients, because they are not needed at the inference stage
    with torch.no_grad():
        for X, Y in dataloader:
            pred = model(X)
            test_loss += loss_function(pred, Y).item()
            correct += (pred.argmax(axis=1) == Y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [None]:
# Number of full passes over the training data.
epochs = 10

# Each epoch trains on the training loader and then evaluates on the test loader.
for epoch_no in range(1, epochs + 1):
    print(f"Epoch {epoch_no}\n-------------------------------")
    training_step(dataloader=train_dataloader, model=model_v1, loss_function=loss_function, optimiser=optimiser)
    test_step(dataloader=test_dataloader, model=model_v1, loss_function=loss_function)
print("Done!")