# Task 3: Regularization and Optimization <br/> CC6204 Deep Learning, Universidad de Chile  <br/>
### Name: Humberto Rodrigues 


In [None]:
# This jupyter notebook should be executed in Colab
import torch
import numpy as np
import torch.nn as nn
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default='colab'

# Tool for automatic test
!pip install -U "git+https://github.com/dccuchile/CC6204.git@master#egg=cc6204&subdirectory=autocorrect"
from timeit import default_timer as timer

Collecting cc6204
  Cloning https://github.com/dccuchile/CC6204.git (to revision master) to /tmp/pip-install-nj80ys0k/cc6204
  Running command git clone -q https://github.com/dccuchile/CC6204.git /tmp/pip-install-nj80ys0k/cc6204
Building wheels for collected packages: cc6204
  Building wheel for cc6204 (setup.py) ... [?25l[?25hdone
  Created wheel for cc6204: filename=cc6204-0.5.0-cp36-none-any.whl size=5801 sha256=c0392d6d5e0cfa63d063004f7de7f36403b573659b8e7194793b256d055416a8
  Stored in directory: /tmp/pip-ephem-wheel-cache-cii9312g/wheels/62/f0/30/aadcb7ce24a2f9c935890518e902d4e23bf97b80f47bb64414
Successfully built cc6204
Installing collected packages: cc6204
  Found existing installation: cc6204 0.5.0
    Uninstalling cc6204-0.5.0:
      Successfully uninstalled cc6204-0.5.0
Successfully installed cc6204-0.5.0


In [None]:
# Importing test tools
from cc6204 import AutoCorrect, FailedTest

corrector = AutoCorrect(host="cc6204.dcc.uchile.cl", port=443)
token = "]ye/Ox;nsz"

Connection stablished


# Part 1: Regularization and Generalization



## 1a) *Weight Decay* Method
The way decay method is commonly also called the `L2` penalization but instead of being applied to the loss function it is applied in the gradient application.

It consists in a reduction of the value of each parameter before applying the gradient during the back propagation algorithm. This reduction is based on the `weight decay` parameter or `beta` how it is called in this examples. The main effect of this is that the parameters will be always triying to be smaller even if we are suffering of `vanishing gradient`. This reduction is not a light take action, because a big value for `beta` can easily nulify the application of the gradient avoiding the learning.

In [None]:
class SGD():
  def __init__(self, parameters, lr, beta=0):
    # Assuming the call will be done with model.parameters()
    self.parameters = [p for p in parameters]
    self.lr = lr
    self.beta = beta
  
  def step(self):
    for p in self.parameters:
      if p.grad is not None:
        p.data = ((1 - self.beta) * p.data) - (self.lr * p.grad)

In [None]:
# Tests del API del curso
weight, grad = corrector.get_test_data(homework=3, question="1a", test=1, token=token)

weight = torch.tensor(weight, requires_grad=True)
weight.grad = torch.tensor(grad)

optimizer = SGD([weight], lr=0.1, beta=0.1)
optimizer.step()

# Submit
corrector.submit(homework=3, question="1a", test=1, token=token, answer=weight)

Correct Test!


## 1b) Inverted Dropout method
This is one of the most common techniques to improve models generalization. It is consist in randomly `turning off` some neurons based on a probability. The idea is to guide the model to learn more efectives and distincts ways to clasify the same data.

In [None]:
# This is the most common approach to implement the derivative expressions
# the gradient parameter is use to determine if should be evaluated
# the function itself or it's corresponding derivative

def tanh(T):
  E = torch.exp(T)
  e = torch.exp(-1 * T)
  return (E - e) * torch.reciprocal(E + e)

def tanh_dx(T):
  return 1 - tanh(T)^2

def softmax(T, dim=1, stable=True):
  maxes = torch.max(T, dim, keepdim=True)[0] if stable else 0
  x_exp = torch.exp(T-maxes)
  x_exp_sum = torch.sum(x_exp, dim, keepdim=True)
  return x_exp/x_exp_sum

def CELoss(Q, P, estable=True, epsilon=1e-8):
  if estable:
    Q[Q < epsilon] = epsilon
  return torch.mean(-torch.sum(P*torch.log(Q),-1))

def relu(T):
  return torch.max(T,torch.zeros(T.size(),device=T.device))

def swish(T, beta=1.0):
  return T * sig(beta * T)

def celu(T, alpha=1.0):
  zero_T = torch.zeros(T.size(),device=T.device)
  return torch.max(T,zero_T) + torch.min(alpha * (torch.exp(T/alpha) - 1),zero_T)

def sig(T, gradient=False):
  if gradient:
    sigT = sig(T)
    return sigT * (1 - sigT)
  return torch.reciprocal(1 + torch.exp(-1 * T))

def relu_dx(T):
  X = T.clone()
  X[X >= 0] = 1
  X[X < 0] = 0
  return X

def swish_dx(T, beta=1.0):
  X = T.clone()
  sig_bx = sig(beta*X)
  return sig_bx + beta * X * sig_bx * (1 - sig_bx)


def swish_db(T, beta=1.0):
  X = T.clone()
  sig_bx = sig(beta*X)
  return  X * X * sig_bx * (1 - sig_bx)
       
 
def celu_dx(T, alpha=1.0):
  X = T.clone()
  X[X >= 0] = 0
  x_alpha = X[X < 0] / alpha
  X[X < 0] = torch.exp(x_alpha)
  return X
                      
def celu_da(T, alpha=1.0):
  X = T.clone()
  X[X >= 0] = 0
  x_alpha = X[X < 0] / alpha
  X[X < 0] = (torch.exp(x_alpha) * (1 - x_alpha)) - 1
  return X

activation_dx = {"relu": relu_dx, "celu": celu_dx, "swish": swish_dx}
activation_dp = {"celu": celu_da, "swish": swish_db}

In [None]:
class FFNN(torch.nn.Module):
  def create_extra_parameters(self, extra):
    if extra:
      res = []
      for p in extra:
        if p:
          res.append(nn.Parameter(torch.tensor(p)))
        else:
          res.append(nn.Parameter(None))
      self.extra_params = nn.ParameterList(res)
    else:
      self.extra_params = None

  def create_parameters(self, h_l, l_a, C, extra=None, initial_weights=None, init_method=None):
    internal_layers = []
    internal_biases = []
    for i in range(0, len(h_l)-1):
      if initial_weights:
        # The structure with initial weights for the `lth` layer will be (W,b)
        # We need to ensure that the dimensions of the data MATCH with the model
        current_weights = initial_weights[i][0]
        current_bias = initial_weights[i][1]
        if len(current_weights) != h_l[i] or len(current_weights[0]) != h_l[i+1] or len(current_bias) != h_l[i+1]:
          raise Exception("Dimension mismatch")
        else:
          internal_layers.append(nn.Parameter(current_weights))
          internal_biases.append(nn.Parameter(current_bias))
      else:
        
        # if not init method is specified the 'best' one is choosen
        # based on the activation function of the ith layer
        if init_method is None:
          init_method = xavier_init if l_a[i].__name__ != "relu" else he_init

        internal_layers.append(nn.Parameter(init_method(h_l[i],h_l[i+1])))
        internal_biases.append(nn.Parameter(torch.zeros(h_l[i+1])))

    self.h_weights = nn.ParameterList(internal_layers)
    self.biases = nn.ParameterList(internal_biases)
    self.activation_functions = l_a[:-1]

    self.create_extra_parameters(extra)

    # Output Layer, bias (c), and softmax 
    # It is important to notice the use of ParameterList also for this layer
    # This is because the nn.Parameter directly is not being reflected in the 
    # print(model) method and that is the one being used for the summary 

    if initial_weights:
      # Is expected that the last position of `initial_weights` is (U,c)
      output_weights = initial_weights[-1][0]
      output_bias = initial_weights[-1][1]

      if len(output_weights) != h_l[-1] or len(output_weights[0]) != C or len(output_bias) != C:
          raise Exception("Dimension mismatch")
      else:
        self.output_weights = nn.ParameterList([nn.Parameter(output_weights)])
        self.output_bias = nn.ParameterList([nn.Parameter(output_bias)])
    else:
      
      # for the last layer if there is no init method specified xavier is de default one
      if init_method is None:
          init_method = xavier_init

      self.output_weights = nn.ParameterList([nn.Parameter(init_method(h_l[-1],C))])
      self.output_bias = nn.ParameterList([nn.Parameter(torch.zeros(C))])
    self.output_function = l_a[-1]

  def __init__(self, F, l_h, l_a, C, extra=None, initial_weights=None, keep_prob=None, init_method=None):
    super(FFNN, self).__init__()
    
    if len(l_h)+1 != len(l_a):
      raise Exception("Dimension mismatch")
    if extra and len(extra) != len(l_a):
      raise Exception("Dimension mismatch")
    if initial_weights and len(initial_weights) != (len(l_h) + 1):
      print(len(initial_weights),(len(l_h) + 1))
      raise Exception("Dimension mismatch")
    if keep_prob and len(keep_prob) != (len(l_h) + 1):
      raise Exception("Dimension mismatch")

    self.create_parameters([F]+l_h,l_a,C,extra,initial_weights,init_method)
    self.keep_prob = keep_prob
  
  def summary(self):
    
    # In this case the summary is being show using the string method defined in the class
    # because is easy to see the details of all the internal parameters
    # also the use of this string method is the reason of addding output weights
    # as a paremetertList to be able to see it Listed here. 

    print(f"#######MODEL SUMMARY#######\n")
    print(f"Input Layer size: {self.h_weights[0].size()[0]}\n")
    print("Internal params detail:")
    print(f"{self}")
    print(f"Activation Functions{[f.__name__ for f in self.activation_functions+[self.output_function]]}\n")
    print(f"Output Layer size: {self.output_weights[0].size()[1]}\n")
    print(f"Total trainable Parameters: { sum(p.numel() for p in self.parameters() if p.requires_grad)}")
    print(f"Total Parameters: { sum(p.numel() for p in self.parameters())}\n")
    print(f"########SUMMARY END########\n\n")
  
  def generate_dropout_mask(self, input, current_keep_prob):

    # Assuming that the first dimension is always the batch size
    batch_size = input.size()[0]

    # Inverted dropout mask
    current_mask = torch.bernoulli(torch.full((batch_size,1),current_keep_prob,device=input.device)) / current_keep_prob
    return current_mask
  
  def forward(self, x, predict=False, output_layer=None):
    cache = {"hs":[], "us":[], "ms": [] }
    for i in range(0, len(self.h_weights)):

      # Verify if we have Dropout activated
      if self.keep_prob and not predict:
        current_mask = self.generate_dropout_mask(x,self.keep_prob[i])
        x = x * current_mask
        cache["ms"].append(current_mask)
      
      # How we are applying the droput at the beggining of each layer
      # We need to add 1 to the output_layer comparison
  
      if output_layer is not None and (i == (output_layer + 1)):     
        return x

      # Validation to retrieve the possible extra params for the i_th activation function
      current_extras = [self.extra_params[i]] if self.extra_params and not self.extra_params[i].nelement() == 0 else []

      u = x @ self.h_weights[i] + self.biases[i]
      
      # x = h for practical effects we are just updating the input variable
      x = self.activation_functions[i](u,*current_extras)

      cache["hs"].append(x)
      cache["us"].append(u)
    

    # Verify if we have Dropout activated for the last hidden layer
    if self.keep_prob and not predict:
      current_mask = self.generate_dropout_mask(x,self.keep_prob[-1])
      x = x * current_mask
      cache["ms"].append(current_mask)
    
    if output_layer is not None and (len(self.h_weights) == (output_layer)):   
      return x

    # Validation to retrieve the possible extra params for the output activation function
    output_extras = [self.extra_params[-1].data] if self.extra_params and not self.extra_params[-1].nelement() == 0 else []

    # y' = x * U + c
    x = self.output_function(torch.matmul(x,self.output_weights[0]) + self.output_bias[0],*output_extras)
    self.cache = cache
    return x

  def backward(self, x, y, y_pred):
    b = x.size()[0]

    dl_du = (1/b) * (y_pred - y)

    self.output_weights[0].grad = self.cache["hs"][-1].t() @ dl_du
    self.output_bias[0].grad = torch.sum(dl_du, 0)

    current_dl_dh = dl_du @ self.output_weights[-1].t()

    if self.keep_prob:
      current_dl_dh *= self.cache["ms"][-1]

    for i in range(len(self.h_weights)-1,-1,-1):
      # extract extra param for activation function in layer i
      current_extras = [self.extra_params[i].data] if self.extra_params and not self.extra_params[i].nelement() == 0 else []

      # calculating dl_du^k
      current_dl_du = current_dl_dh * activation_dx[self.activation_functions[i].__name__](self.cache["us"][i],*current_extras)

      # in case there is a trainable parameter we calculate it is grad
      if current_extras:
        
        res = torch.mean(
          torch.flatten(
            (
              current_dl_dh * activation_dp[self.activation_functions[i].__name__](self.cache["us"][i],*current_extras)
            )
          )
        )
        current_extras[0].grad = res
      # special condition to set x as the first h_i
      h_k = self.cache["hs"][i-1] if i > 0 else x
      
      # dl_dw^k
      self.h_weights[i].grad = h_k.t() @ current_dl_du
      
      #dl_db^k
      self.biases[i].grad =  torch.sum(current_dl_du, 0)

      #dl_dh^(k-1)
      current_dl_dh = current_dl_du @ self.h_weights[i].t()

      # We use the stored dropout mask to multiply the corresponding gradient
      if self.keep_prob:
        current_dl_dh *= self.cache["ms"][i]


In [None]:
demo_model = FFNN(300, [50,30], [relu,relu,softmax], 10, keep_prob=[1.0, 0.5, 0.7],init_method=torch.randn)
demo_input = torch.randn(25,300)
demo_output = torch.randn(25,10)
y_hat = demo_model(demo_input)
demo_model.backward(demo_input,demo_output,y_hat)


Setting attributes on ParameterList is not supported.



In [None]:
# Tests del API del curso
torch.manual_seed(0)
sample = torch.rand(1, 10)
red = FFNN(10, [1000], [sig,softmax], 1, keep_prob=[1.0, 0.5],init_method=torch.randn)
y = red(sample, output_layer=0)
output_mask = (y == 0)
percent = torch.sum(output_mask).item() / list(output_mask.size())[-1]

# Submit
corrector.submit(homework=3, question="1b", test=1, token=token, answer=percent)


Setting attributes on ParameterList is not supported.



Correct Test!


## 1c) Testing with MNIST 

In [None]:
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset, DataLoader


# Downloading and storing MNIST dataset in trainable mode.
dataset_train = MNIST('mnist', train=True, transform=ToTensor(), download=True)
dataset_test = MNIST('mnist', train=False, transform=ToTensor(), download=True)
print(f'Total data train: {len(dataset_train)} test: {len(dataset_test)}',)

Total data train: 60000 test: 10000


In [None]:
def plot_results(loss_data, acc_data):
  loss_fig = go.Figure()
  for result in loss_data:
    x_axis = result["x"]
    y_axis = result["y"]
    loss_fig.add_scatter(
        x=x_axis,
        y=y_axis,
        mode="lines+markers", 
        textposition="bottom center",
        name=result["name"]
    )
  loss_fig.update_layout(
      autosize=False,
      width=700,
      height=450,
      title="Loss Chart",
      xaxis_title="Epochs",
      yaxis_title="Loss",
      font=dict(
          family="Courier New, monospace",
          size=14,
          color="#7f7f7f"
      )
  )
  loss_fig.show()
  print("\n")
  acc_fig = go.Figure()
  for result in acc_data:
    x_axis = result["x"]
    y_axis = result["y"]
    acc_fig.add_scatter( 
        x=x_axis,
        y=y_axis,
        mode="lines+markers", 
        textposition="bottom center",
        name=result["name"]
    )
  acc_fig.update_layout(
      autosize=False,
      width=700,
      height=450,
      title="Accuracy Chart",
      xaxis_title="Epochs",
      yaxis_title="Acc",
      font=dict(
          family="Courier New, monospace",
          size=14,
          color="#7f7f7f"
      )
  )
  acc_fig.show()

def custom_collate(batch):
  x_data = [x[0].view(28*28) for x in batch ]

  y_data = []

  for t in batch:
    ct = torch.zeros(10)
    ct[t[1]] = 1
    y_data.append(ct)
  return torch.stack(x_data),torch.stack(y_data)

In [None]:
def train(network, dataset, optimizer, epochs=1, batch_size=1, device='cuda', collate=None):
  network.to(device)
  data = DataLoader(dataset,batch_size=batch_size,shuffle=True,collate_fn=collate)
  loss, acc = [], []
  for e in range(1,epochs+1):
    acc_c = 0
    loss_c = 0
    for x, y in data:
      x, y = x.to(device).float(), y.to(device)
      y_pred = network(x)
      loss_c += CELoss(y_pred,y).item()
      acc_c += torch.sum((torch.argmax(y_pred, 1) == torch.argmax(y,1))).item()
      network.backward(x,y,y_pred)
      optimizer.step()
    
    print(f"Epoch: {e} | Acc: {acc_c/len(dataset)*100:.2f}% | loss: {loss_c/len(dataset)*batch_size:.3f}")
    acc.append(acc_c/len(dataset))
    loss.append(loss_c/len(dataset)*batch_size)
  return loss, acc

In [None]:
def test(network, dataset, batch_size=1, device='cuda', collate=None):
  network.to(device)
  data = DataLoader(dataset,batch_size=batch_size,shuffle=True,collate_fn=collate)
  acc_c = 0
  loss_c = 0
  for x, y in data:
    x, y = x.to(device).float(), y.to(device)
    y_pred = network(x,predict=True)
    loss_c += CELoss(y_pred,y).item()
    acc_c += torch.sum((torch.argmax(y_pred, 1) == torch.argmax(y,1))).item()
  print(f"\nTest Results: Acc: {acc_c/len(dataset)*100:.2f}% | loss: {loss_c/len(dataset)*batch_size:.3f}")
  return loss_c/len(dataset)*batch_size, acc_c/len(dataset)

Since now we will be using the same network structure for all our tests. This is a powerful architecture capable of achieve 70+ accuracy in this specific dataset. Over it we will start adding some regularization techniques and we will analyze how the results change in base of our metrics and the training time

The structure we will be using as base will be:

```python
FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=torch.randn)
```

### Baseline model

In [None]:
# Base model structure training and test
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=torch.randn)
model.to("cuda")
optimizer = SGD(model.parameters(), lr=1e-5)

with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=20,batch_size=200,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=200,collate=custom_collate)

  # Formatting results to plot
  loss_data = {
      "x": [x+1 for x in range(20)],
      "y": loss,
      "name": "Baseline model Loss"
  }

  acc_data = {
      "x": [x+1 for x in range(20)],
      "y": acc,
      "name": "Baseline model Acc"
  }
  


Setting attributes on ParameterList is not supported.



Epoch: 1 | Acc: 23.08% | loss: 14.166
Epoch: 2 | Acc: 46.30% | loss: 9.883
Epoch: 3 | Acc: 56.75% | loss: 7.960
Epoch: 4 | Acc: 62.29% | loss: 6.941
Epoch: 5 | Acc: 65.95% | loss: 6.266
Epoch: 6 | Acc: 68.55% | loss: 5.787
Epoch: 7 | Acc: 70.38% | loss: 5.450
Epoch: 8 | Acc: 71.89% | loss: 5.173
Epoch: 9 | Acc: 73.16% | loss: 4.941
Epoch: 10 | Acc: 74.12% | loss: 4.762
Epoch: 11 | Acc: 74.97% | loss: 4.606
Epoch: 12 | Acc: 75.68% | loss: 4.474
Epoch: 13 | Acc: 76.32% | loss: 4.359
Epoch: 14 | Acc: 76.84% | loss: 4.260
Epoch: 15 | Acc: 77.42% | loss: 4.154
Epoch: 16 | Acc: 77.81% | loss: 4.081
Epoch: 17 | Acc: 78.23% | loss: 4.004
Epoch: 18 | Acc: 78.61% | loss: 3.935
Epoch: 19 | Acc: 78.85% | loss: 3.891
Epoch: 20 | Acc: 79.20% | loss: 3.825

Test Results: Acc: 79.49% | loss: 3.773


In [None]:
# Baseline model test vs train metrics
print(f"Difference between best train loss and test loss: {abs(np.min(loss)-test_loss):.3f}")
print(f"Difference between best train acc and test acc: {abs(np.max(acc)-test_acc)*100:.3f}%\n") 
plot_results([loss_data],[acc_data])

Difference between best train loss and test loss: 0.052
Difference between best train acc and test acc: 0.293%







### Adding Inverted Dropout


In [None]:
# Test model using Inverted Dropout
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=torch.randn,keep_prob=[1.0,0.8,0.8,0.9])
model.to("cuda")
optimizer = SGD(model.parameters(), lr=1e-5)

with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=40,batch_size=200,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=200,collate=custom_collate)

  # Formatting results to plot
  loss_data = {
    "x": [x+1 for x in range(40)],
    "y": loss,
    "name": "Dropout model Loss"
  }

  acc_data = {
    "x": [x+1 for x in range(40)],
    "y": acc,
    "name": "Dropout model Acc"
  }



Setting attributes on ParameterList is not supported.



Epoch: 1 | Acc: 22.97% | loss: 8.187
Epoch: 2 | Acc: 35.25% | loss: 6.022
Epoch: 3 | Acc: 40.85% | loss: 4.987
Epoch: 4 | Acc: 44.06% | loss: 4.339
Epoch: 5 | Acc: 45.89% | loss: 3.970
Epoch: 6 | Acc: 47.43% | loss: 3.712
Epoch: 7 | Acc: 48.47% | loss: 3.518
Epoch: 8 | Acc: 49.36% | loss: 3.322
Epoch: 9 | Acc: 50.16% | loss: 3.240
Epoch: 10 | Acc: 50.40% | loss: 3.120
Epoch: 11 | Acc: 50.47% | loss: 3.061
Epoch: 12 | Acc: 51.57% | loss: 3.002
Epoch: 13 | Acc: 51.70% | loss: 2.940
Epoch: 14 | Acc: 52.08% | loss: 2.907
Epoch: 15 | Acc: 52.05% | loss: 2.862
Epoch: 16 | Acc: 52.11% | loss: 2.806
Epoch: 17 | Acc: 52.52% | loss: 2.797
Epoch: 18 | Acc: 52.75% | loss: 2.746
Epoch: 19 | Acc: 52.85% | loss: 2.720
Epoch: 20 | Acc: 53.15% | loss: 2.708
Epoch: 21 | Acc: 53.34% | loss: 2.665
Epoch: 22 | Acc: 53.52% | loss: 2.663
Epoch: 23 | Acc: 53.46% | loss: 2.676
Epoch: 24 | Acc: 53.61% | loss: 2.652
Epoch: 25 | Acc: 53.41% | loss: 2.634
Epoch: 26 | Acc: 53.38% | loss: 2.656
Epoch: 27 | Acc: 53.4

In [None]:
# DROPOUT model test vs train metrics
print(f"Difference between best train loss and test loss: {abs(np.min(loss)-test_loss):.3f}")
print(f"Difference between best train acc and test acc: {abs(np.max(acc)-test_acc)*100:.3f}%\n") 
plot_results([loss_data], [acc_data])

Difference between best train loss and test loss: 0.182
Difference between best train acc and test acc: 31.305%







### Adding weight Decay

In [None]:
# Test model using Inverted Dropout
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=torch.randn)
model.to("cuda")
optimizer = SGD(model.parameters(), lr=1e-5, beta=1e-4)

with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=20,batch_size=200,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=200,collate=custom_collate)
  
  # Formatting results to plot
  loss_data = {
    "x": [x+1 for x in range(20)],
    "y": loss,
    "name": "Dropout model Loss"
  }

  acc_data = {
    "x": [x+1 for x in range(20)],
    "y": acc,
    "name": "Dropout model Acc"
  }


Setting attributes on ParameterList is not supported.



Epoch: 1 | Acc: 24.61% | loss: 13.884
Epoch: 2 | Acc: 47.48% | loss: 9.668
Epoch: 3 | Acc: 57.47% | loss: 7.827
Epoch: 4 | Acc: 62.49% | loss: 6.901
Epoch: 5 | Acc: 65.52% | loss: 6.343
Epoch: 6 | Acc: 67.78% | loss: 5.925
Epoch: 7 | Acc: 69.35% | loss: 5.637
Epoch: 8 | Acc: 70.54% | loss: 5.414
Epoch: 9 | Acc: 71.59% | loss: 5.216
Epoch: 10 | Acc: 72.43% | loss: 5.065
Epoch: 11 | Acc: 73.09% | loss: 4.937
Epoch: 12 | Acc: 73.69% | loss: 4.828
Epoch: 13 | Acc: 74.08% | loss: 4.751
Epoch: 14 | Acc: 74.58% | loss: 4.655
Epoch: 15 | Acc: 74.99% | loss: 4.575
Epoch: 16 | Acc: 75.28% | loss: 4.520
Epoch: 17 | Acc: 75.57% | loss: 4.458
Epoch: 18 | Acc: 75.85% | loss: 4.405
Epoch: 19 | Acc: 76.11% | loss: 4.345
Epoch: 20 | Acc: 76.34% | loss: 4.303

Test Results: Acc: 77.62% | loss: 4.071


In [None]:
# Weight Decay model test vs train metrics
print(f"Difference between best train loss and test loss: {abs(np.min(loss)-test_loss):.3f}")
print(f"Difference between best train acc and test acc: {abs(np.max(acc)-test_acc)*100:.3f}%\n") 
plot_results([loss_data], [acc_data])

Difference between best train loss and test loss: 0.232
Difference between best train acc and test acc: 1.282%







### Combining Inverted Dropout + Weight Decay

In [None]:
# Test model using Inverted Dropout + Weight Decay
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=torch.randn,keep_prob=[1.0,0.8,0.8,0.9])
model.to("cuda")
optimizer = SGD(model.parameters(), lr=1e-5, beta=1e-5)

with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=30,batch_size=200,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=200,collate=custom_collate)
  
  # Formatting results to plot
  loss_data = {
    "x": [x+1 for x in range(30)],
    "y": loss,
    "name": "Dropout model Loss"
  }

  acc_data = {
    "x": [x+1 for x in range(30)],
    "y": acc,
    "name": "Dropout model Acc"
  }



Setting attributes on ParameterList is not supported.



Epoch: 1 | Acc: 19.73% | loss: 8.753
Epoch: 2 | Acc: 33.70% | loss: 6.251
Epoch: 3 | Acc: 39.70% | loss: 5.140
Epoch: 4 | Acc: 43.01% | loss: 4.502
Epoch: 5 | Acc: 45.56% | loss: 4.089
Epoch: 6 | Acc: 46.66% | loss: 3.845
Epoch: 7 | Acc: 48.38% | loss: 3.625
Epoch: 8 | Acc: 49.01% | loss: 3.418
Epoch: 9 | Acc: 49.73% | loss: 3.309
Epoch: 10 | Acc: 50.52% | loss: 3.210
Epoch: 11 | Acc: 50.76% | loss: 3.129
Epoch: 12 | Acc: 50.97% | loss: 3.063
Epoch: 13 | Acc: 51.86% | loss: 2.938
Epoch: 14 | Acc: 52.02% | loss: 2.901
Epoch: 15 | Acc: 52.20% | loss: 2.873
Epoch: 16 | Acc: 52.29% | loss: 2.840
Epoch: 17 | Acc: 52.39% | loss: 2.794
Epoch: 18 | Acc: 52.81% | loss: 2.762
Epoch: 19 | Acc: 52.64% | loss: 2.733
Epoch: 20 | Acc: 53.05% | loss: 2.683
Epoch: 21 | Acc: 53.16% | loss: 2.696
Epoch: 22 | Acc: 52.85% | loss: 2.667
Epoch: 23 | Acc: 53.43% | loss: 2.622
Epoch: 24 | Acc: 53.30% | loss: 2.594
Epoch: 25 | Acc: 53.65% | loss: 2.621
Epoch: 26 | Acc: 53.74% | loss: 2.575
Epoch: 27 | Acc: 53.7

In [None]:
# Inverted Dropout + Weight Decay model test vs train metrics
print(f"Difference between best train loss and test loss: {abs(np.min(loss)-test_loss):.3f}")
print(f"Difference between best train acc and test acc: {abs(np.max(acc)-test_acc)*100:.3f}%\n") 
plot_results([loss_data], [acc_data])

Difference between best train loss and test loss: 0.097
Difference between best train acc and test acc: 31.770%







### Results

The first thing to notice after applying these regularization methods is the increment in the time needed in the convergency process of the model, that is the main reason we apply a different number of epochs to train the different tests, specifically the `dropout` method requires more training and this time is deeply connected to how aggresives are the probabilities to drop neurons. Another important fact is that the estimation of those `keep_probabilities` can be easily interpreted as hyper-parameters of the model and are also related to the `width` of the layers where it will be applied. In other words, more aggresively dropout chances can be applied in `wider` layers. 

In terms of advantages, we can notice a really big `generalization leap` (based in accuracy) between the model using dropout and the baseline, one which is the main reason of applying this approach. In the other hand in the charts is also visible that the curves are not smooth as the baseline ones this means that closer to the convergency point there is more `back` and `forth` descending through the loss.

Regarding the `Weight decay` method there is no much more time added to the convergency but there are different facts to mention. The main one is the difficulty choosing a value for the `beta` parameter which is also a new hyper-parameter for a model. A big value (less than `1` of course ) can cause a tremendous reduction of original content of the parameter causing in some cases that the application of the grad does not have effect at all. If beta is close to epsilon then almost no `weigth decay` is  being applied. 
  

# Part 2: Optimization algorithms

## 2a) Xavier and He initialization methods

A common issue in the fully connected neural networks is the gradient vanishing problem which can be understood as the tendency of the gradients to approach 0 in the backward process due to continuous multiplications. A possible approach to amortize this effect is to set initial weights with specific properties, two of the most common initialization schemas are `xavier` and `he` based in the activation function of the layer. In general `he` method is used when the activation function is `relu` otherwise `xavier` is used. 

In [None]:
import math

def xavier_init(first_dim, second_dim, r=None):
  if r is None:
    r = torch.randn((first_dim,second_dim))
  return r * (math.sqrt(1/first_dim))

# Variation of xavier using a uniform distribution for r instead of a normal one
def xavier_init_uniform(first_dim, second_dim, r=None):
  if r is None:
    r = torch.rand((first_dim,second_dim))
  return r * (math.sqrt(1/first_dim))

def he_init(first_dim, second_dim, r=None):
  if r is None:
    r = torch.randn((first_dim,second_dim))
  return r * (math.sqrt(2/first_dim))

def he_init_uniform(first_dim, second_dim, r=None):
  if r is None:
    r = torch.rand((first_dim,second_dim))
  return r * (math.sqrt(2/first_dim))

In [None]:
# Running xavier and he automatic tests
r_xavier = corrector.get_test_data(homework=3, question="2a", test=1, token=token)
r_he = corrector.get_test_data(homework=3, question="2a", test=2, token=token)

w_xavier = xavier_init(50, 50, torch.tensor(r_xavier))
w_he = he_init(50, 50, torch.tensor(r_he))

corrector.submit(homework=3, question="2a", test=1, token=token, answer=w_xavier)
corrector.submit(homework=3, question="2a", test=2, token=token, answer=w_he)

Correct Test!
Correct Test!


### **Important**:
We will be apllying this schemas in our `FFNN` class by adding a new parameter in the constructor called `init_method`. The expected value for this parameter is the python function that will be executed to generate the initial tensors.

example:

```python
# This will set all the initial weights with `xavier` method
model = FFNN(784,[512,1024,128],[relu,relu,relu, softmax],10,init_method=xavier_init)

# This will set all the initial weights with `he` method
model = FFNN(784,[512,1024,128],[relu,relu,relu, softmax],10,init_method=he_init)

# It is important to notice we can obtaing the same results
# we had before applying this heuristics passing randn as the init method
model = FFNN(784,[512,1024,128],[relu,relu,relu, softmax],10,init_method=torch.randn)

# Passing init_method as None will result in applying this two functions based of the activation layer
# This is the heuristic way we described before
model = FFNN(784,[512,1024,128],[relu,relu,relu, softmax],10)

```

## 2b) Stochastic gradient descend with momentum

In [None]:
# SGD With momentum
class SGD():
  def __init__(self, parameters, lr=0.001, momentum=0):

    # Assuming the call will be done with model.parameters()
    self.parameters = [p for p in parameters if p is not None]
    self.lr = lr
    self.momentum = momentum
    self.V = [ torch.zeros_like(p) for p in self.parameters if p is not None ]
  
  def step(self):
    for idx, p in enumerate(self.parameters):
      if p.grad is not None:
        self.V[idx].data = (self.momentum * self.V[idx].data) - (self.lr * p.grad)
        p.data = p.data + self.V[idx].data

In [None]:
# Tests del API del curso
weight, grad = corrector.get_test_data(homework=3, question="2b", test=1, token=token)

weight = torch.tensor(weight, requires_grad=True)
weight.grad = torch.tensor(grad)

optimizer = SGD([weight], lr=0.1, momentum=0.9)
optimizer.step()

# Submit
corrector.submit(homework=3, question="2b", test=1, token=token, answer=weight)
optimizer.step()
corrector.submit(homework=3, question="2b", test=2, token=token, answer=weight)

Correct Test!
Correct Test!


## 2c) RMSProp

In [None]:
class RMSProp():
  def __init__(self, parameters, lr=0.001, beta=0.9, epsilon=1e-8):

    # Assuming the call will be done with model.parameters()
    self.parameters = [p for p in parameters if p is not None]

    # Commonly called lambda in the formulations
    self.lr = lr

    # beta will be the weighting value for the EMA
    self.beta = beta

    # small computational trick to avoid divisions by 0
    self.epsilon = epsilon

    # EMA accumulator 
    self.S = [ torch.zeros_like(p) for p in self.parameters if p is not None ]
  
  def step(self):
    for idx, p in enumerate(self.parameters):
      if p.grad is not None:
        self.S[idx].data = (self.beta * self.S[idx].data) + ((1-self.beta) * (p.grad * p.grad))
        p.data = p.data - ((self.lr/(self.epsilon + torch.sqrt(self.S[idx].data)))* p.grad)

In [None]:
# Tests del API del curso
weight, grad = corrector.get_test_data(homework=3, question="2c", test=1, token=token)

weight = torch.tensor(weight, requires_grad=True)
weight.grad = torch.tensor(grad)

optimizer = RMSProp([weight], lr=0.001, beta=0.9, epsilon=1e-8)
optimizer.step()

# Submit
corrector.submit(homework=3, question="2c", test=1, token=token, answer=weight)
optimizer.step()
corrector.submit(homework=3, question="2c", test=2, token=token, answer=weight)

Correct Test!
Correct Test!


## 2d) Adam

In [None]:
class Adam():
  def __init__(self, parameters, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):

    # Assuming the call will be done with model.parameters()
    self.parameters = [p for p in parameters if p is not None]

    # Commonly called lambda in the formulations
    self.lr = lr

    # beta1 will be the weighting value for P
    self.beta1 = beta1

    # beta1 will be the weighting value for S
    self.beta2 = beta2

    # small computational trick to avoid divisions by 0
    self.epsilon = epsilon

    # P EMA accumulator 
    self.P = [ torch.zeros_like(p) for p in self.parameters if p is not None ]

    # S EMA accumulator 
    self.S = [ torch.zeros_like(p) for p in self.parameters if p is not None ]

    # Represent the nummber of accumulations commonly called t
    self.n = 1
  
  def step(self):
    for idx, p in enumerate(self.parameters):
      if p.grad is not None:
        self.P[idx].data = (self.beta1 * self.P[idx].data) + ((1-self.beta1) * p.grad)
        self.S[idx].data = (self.beta2 * self.S[idx].data) + ((1-self.beta2) * (p.grad * p.grad))

        P_fixed = self.P[idx].data / (1 - (self.beta1**self.n))
        S_fixed = self.S[idx].data / (1 - (self.beta2**self.n))

        p.data = p.data - ((self.lr/(self.epsilon + torch.sqrt(S_fixed)))* P_fixed)

        self.n += 1

In [None]:
# Tests del API del curso
weight, grad = corrector.get_test_data(homework=3, question="2d", test=1, token=token)

weight = torch.tensor(weight, requires_grad=True)
weight.grad = torch.tensor(grad)

optimizer = Adam([weight], lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8)
optimizer.step()

# Submit
corrector.submit(homework=3, question="2d", test=1, token=token, answer=weight)
optimizer.step()
corrector.submit_check_some(homework=3, question="2d", tests=[2, 3], token=token,
                            answer_dict={2: weight, 3: weight}, required_number=1)

Correct Test!
Correct Test!


## 2e) Experiments using MNIST (Powerful ones)

In this sections we will be testing the same netwrok structure in terms of capacity, learn rate and the same dataset with different optimization algorithms and initialization methods.



### Using Fixed **Xavier** initialization:

In [None]:
fixed_xavier_data_to_plot_loss = []
fixed_xavier_data_to_plot_acc = []

In [None]:
# SGD + momentum test
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=xavier_init)
model.to('cuda')
optimizer = SGD(model.parameters(),lr=1e-3,momentum=0.9)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  fixed_xavier_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Fixed-Xavier SGD + momentum loss"
  })

  fixed_xavier_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Fixed-Xavier SGD + momentum Acc"
  })


Setting attributes on ParameterList is not supported.



Epoch: 1 | Acc: 65.56% | loss: 1.495
Epoch: 2 | Acc: 86.94% | loss: 0.499
Epoch: 3 | Acc: 89.64% | loss: 0.369
Epoch: 4 | Acc: 90.81% | loss: 0.321
Epoch: 5 | Acc: 91.54% | loss: 0.291
Epoch: 6 | Acc: 92.24% | loss: 0.268
Epoch: 7 | Acc: 92.83% | loss: 0.249
Epoch: 8 | Acc: 93.35% | loss: 0.233
Epoch: 9 | Acc: 93.78% | loss: 0.218
Epoch: 10 | Acc: 94.14% | loss: 0.204

Test Results: Acc: 94.42% | loss: 0.195


In [None]:
# RMSProp test
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=xavier_init)
model.to('cuda')
optimizer = RMSProp(model.parameters(),lr=1e-3)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  fixed_xavier_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Fixed-Xavier RMSProp loss"
  })

  fixed_xavier_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Fixed-Xavier RMSProp Acc"
  })


Setting attributes on ParameterList is not supported.



Epoch: 1 | Acc: 92.89% | loss: 0.226
Epoch: 2 | Acc: 97.16% | loss: 0.094
Epoch: 3 | Acc: 97.97% | loss: 0.070
Epoch: 4 | Acc: 98.42% | loss: 0.054
Epoch: 5 | Acc: 98.64% | loss: 0.047
Epoch: 6 | Acc: 98.85% | loss: 0.041
Epoch: 7 | Acc: 99.02% | loss: 0.036
Epoch: 8 | Acc: 99.13% | loss: 0.033
Epoch: 9 | Acc: 99.19% | loss: 0.030
Epoch: 10 | Acc: 99.28% | loss: 0.028

Test Results: Acc: 97.53% | loss: 0.145


In [None]:
# Adam test
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=xavier_init)
model.to('cuda')
optimizer = Adam(model.parameters(),lr=1e-3)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  fixed_xavier_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Fixed-Xavier Adam loss"
  })

  fixed_xavier_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Fixed-Xavier Adam Acc"
  })


Setting attributes on ParameterList is not supported.



Epoch: 1 | Acc: 93.20% | loss: 0.226
Epoch: 2 | Acc: 97.52% | loss: 0.083
Epoch: 3 | Acc: 98.22% | loss: 0.057
Epoch: 4 | Acc: 98.63% | loss: 0.042
Epoch: 5 | Acc: 98.90% | loss: 0.034
Epoch: 6 | Acc: 99.05% | loss: 0.031
Epoch: 7 | Acc: 99.15% | loss: 0.027
Epoch: 8 | Acc: 99.22% | loss: 0.024
Epoch: 9 | Acc: 99.22% | loss: 0.025
Epoch: 10 | Acc: 99.35% | loss: 0.020

Test Results: Acc: 98.01% | loss: 0.099


In [None]:
plot_results(fixed_xavier_data_to_plot_loss, fixed_xavier_data_to_plot_acc)





### Using Fixed **He** initialization:

In [None]:
fixed_he_data_to_plot_loss = []
fixed_he_data_to_plot_acc = []

In [None]:
# SGD + momentum test
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=he_init)
model.to('cuda')
optimizer = SGD(model.parameters(),lr=1e-3,momentum=0.9)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  fixed_he_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Fixed-He SGD + momentum loss"
  })

  fixed_he_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Fixed-He SGD + momentum Acc"
  })


Setting attributes on ParameterList is not supported.



Epoch: 1 | Acc: 78.87% | loss: 0.814
Epoch: 2 | Acc: 90.06% | loss: 0.351
Epoch: 3 | Acc: 91.56% | loss: 0.293
Epoch: 4 | Acc: 92.52% | loss: 0.260
Epoch: 5 | Acc: 93.28% | loss: 0.236
Epoch: 6 | Acc: 93.77% | loss: 0.216
Epoch: 7 | Acc: 94.25% | loss: 0.200
Epoch: 8 | Acc: 94.66% | loss: 0.186
Epoch: 9 | Acc: 95.03% | loss: 0.174
Epoch: 10 | Acc: 95.31% | loss: 0.163

Test Results: Acc: 95.41% | loss: 0.161


In [None]:
# RMSProp test
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=he_init)
model.to('cuda')
optimizer = RMSProp(model.parameters(),lr=1e-3)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  fixed_he_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Fixed-He RMSProp loss"
  })

  fixed_he_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Fixed-He RMSProp Acc"
  })


Setting attributes on ParameterList is not supported.



Epoch: 1 | Acc: 92.84% | loss: 0.229
Epoch: 2 | Acc: 97.21% | loss: 0.094
Epoch: 3 | Acc: 97.95% | loss: 0.069
Epoch: 4 | Acc: 98.35% | loss: 0.057
Epoch: 5 | Acc: 98.65% | loss: 0.049
Epoch: 6 | Acc: 98.83% | loss: 0.044
Epoch: 7 | Acc: 98.96% | loss: 0.041
Epoch: 8 | Acc: 99.01% | loss: 0.036
Epoch: 9 | Acc: 99.11% | loss: 0.034
Epoch: 10 | Acc: 99.28% | loss: 0.030

Test Results: Acc: 98.00% | loss: 0.112


In [None]:
# Adam test
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=he_init)
model.to('cuda')
optimizer = Adam(model.parameters(),lr=1e-3)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  fixed_he_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Fixed-He Adam loss"
  })

  fixed_he_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Fixed-He Adam Acc"
  })


Setting attributes on ParameterList is not supported.



Epoch: 1 | Acc: 93.59% | loss: 0.211
Epoch: 2 | Acc: 97.47% | loss: 0.081
Epoch: 3 | Acc: 98.27% | loss: 0.054
Epoch: 4 | Acc: 98.62% | loss: 0.042
Epoch: 5 | Acc: 98.90% | loss: 0.033
Epoch: 6 | Acc: 98.94% | loss: 0.032
Epoch: 7 | Acc: 99.19% | loss: 0.026
Epoch: 8 | Acc: 99.19% | loss: 0.026
Epoch: 9 | Acc: 99.30% | loss: 0.022
Epoch: 10 | Acc: 99.32% | loss: 0.022

Test Results: Acc: 97.81% | loss: 0.094


In [None]:
plot_results(fixed_he_data_to_plot_loss, fixed_he_data_to_plot_acc)





### Using non-fixed Heuristic initialization:

In [None]:
heuristic_data_to_plot_loss = []
heuristic_data_to_plot_acc = []

In [None]:
# SGD + momentum test

# It is important to mention again that not passing an specific init_method
# will result in applying them in an heuristic (inteligent way) based on the activation function

model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10)
model.to('cuda')
optimizer = SGD(model.parameters(),lr=1e-3,momentum=0.9)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  heuristic_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Heuristic SGD + momentum loss"
  })

  heuristic_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Heuristic SGD + momentum Acc"
  })


Setting attributes on ParameterList is not supported.



Epoch: 1 | Acc: 77.96% | loss: 0.823
Epoch: 2 | Acc: 89.77% | loss: 0.362
Epoch: 3 | Acc: 91.23% | loss: 0.303
Epoch: 4 | Acc: 92.26% | loss: 0.270
Epoch: 5 | Acc: 92.96% | loss: 0.246
Epoch: 6 | Acc: 93.45% | loss: 0.227
Epoch: 7 | Acc: 93.95% | loss: 0.211
Epoch: 8 | Acc: 94.37% | loss: 0.196
Epoch: 9 | Acc: 94.74% | loss: 0.184
Epoch: 10 | Acc: 95.05% | loss: 0.172

Test Results: Acc: 95.06% | loss: 0.170


In [None]:
# RMSProp test
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10)
model.to('cuda')
optimizer = RMSProp(model.parameters(),lr=1e-3)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  heuristic_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Heuristic RMSProp loss"
  })

  heuristic_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Heuristic RMSProp Acc"
  })


Setting attributes on ParameterList is not supported.



Epoch: 1 | Acc: 93.03% | loss: 0.229
Epoch: 2 | Acc: 97.26% | loss: 0.095
Epoch: 3 | Acc: 97.94% | loss: 0.070
Epoch: 4 | Acc: 98.38% | loss: 0.058
Epoch: 5 | Acc: 98.62% | loss: 0.049
Epoch: 6 | Acc: 98.82% | loss: 0.040
Epoch: 7 | Acc: 99.02% | loss: 0.037
Epoch: 8 | Acc: 99.11% | loss: 0.034
Epoch: 9 | Acc: 99.23% | loss: 0.032
Epoch: 10 | Acc: 99.24% | loss: 0.031

Test Results: Acc: 97.84% | loss: 0.131


In [None]:
# Adam test
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10)
model.to('cuda')
optimizer = Adam(model.parameters(),lr=1e-3)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  heuristic_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Heuristic Adam loss"
  })

  heuristic_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Heuristic Adam Acc"
  })


Setting attributes on ParameterList is not supported.



Epoch: 1 | Acc: 93.55% | loss: 0.212
Epoch: 2 | Acc: 97.50% | loss: 0.080
Epoch: 3 | Acc: 98.29% | loss: 0.056
Epoch: 4 | Acc: 98.76% | loss: 0.038
Epoch: 5 | Acc: 98.89% | loss: 0.035
Epoch: 6 | Acc: 99.02% | loss: 0.031
Epoch: 7 | Acc: 99.01% | loss: 0.031
Epoch: 8 | Acc: 99.26% | loss: 0.022
Epoch: 9 | Acc: 99.25% | loss: 0.024
Epoch: 10 | Acc: 99.40% | loss: 0.020

Test Results: Acc: 97.68% | loss: 0.104


In [None]:
plot_results(heuristic_data_to_plot_loss, heuristic_data_to_plot_acc)





### Using the previous approach (torch.randn) initialization:

In [None]:
randn_data_to_plot_loss = []
randn_data_to_plot_acc = []

In [None]:
# SGD + momentum test

model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=torch.randn)
model.to('cuda')
optimizer = SGD(model.parameters(),lr=1e-3,momentum=0.9)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  randn_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Torch.randn SGD + momentum loss"
  })

  randn_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Torch.randn SGD + momentum Acc"
  })


Setting attributes on ParameterList is not supported.



Epoch: 1 | Acc: 11.37% | loss: 2.484
Epoch: 2 | Acc: 11.25% | loss: 2.303
Epoch: 3 | Acc: 11.24% | loss: 2.302
Epoch: 4 | Acc: 11.24% | loss: 2.302
Epoch: 5 | Acc: 11.24% | loss: 2.302
Epoch: 6 | Acc: 11.24% | loss: 2.302
Epoch: 7 | Acc: 11.24% | loss: 2.301
Epoch: 8 | Acc: 11.24% | loss: 2.301
Epoch: 9 | Acc: 11.24% | loss: 2.301
Epoch: 10 | Acc: 11.24% | loss: 2.301

Test Results: Acc: 11.35% | loss: 2.301


In [46]:
# RMSProp test
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=torch.randn)
model.to('cuda')
optimizer = RMSProp(model.parameters(),lr=1e-3)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  randn_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Torch.randn RMSProp loss"
  })

  randn_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Torch.randn RMSProp Acc"
  })


Setting attributes on ParameterList is not supported.



Epoch: 1 | Acc: 66.65% | loss: 6.138
Epoch: 2 | Acc: 84.05% | loss: 2.934
Epoch: 3 | Acc: 87.01% | loss: 2.390
Epoch: 4 | Acc: 88.81% | loss: 2.058
Epoch: 5 | Acc: 89.74% | loss: 1.884
Epoch: 6 | Acc: 90.67% | loss: 1.715
Epoch: 7 | Acc: 91.26% | loss: 1.606
Epoch: 8 | Acc: 92.08% | loss: 1.456
Epoch: 9 | Acc: 92.35% | loss: 1.407
Epoch: 10 | Acc: 92.90% | loss: 1.303

Test Results: Acc: 92.41% | loss: 1.392


In [None]:
# Adam test
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=torch.randn)
model.to('cuda')
optimizer = Adam(model.parameters(),lr=1e-3)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  randn_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Torch.randn Adam loss"
  })

  randn_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Torch.randn Adam Acc"
  })


Setting attributes on ParameterList is not supported.



Epoch: 1 | Acc: 74.73% | loss: 4.653
Epoch: 2 | Acc: 86.80% | loss: 2.426
Epoch: 3 | Acc: 88.72% | loss: 2.075


In [None]:
plot_results(randn_data_to_plot_loss, randn_data_to_plot_acc)

### Using Fixed **Xavier-uniform** variant

In [None]:
uniform_xavier_data_to_plot_loss = []
uniform_xavier_data_to_plot_acc = []

In [None]:
# SGD + momentum test
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=xavier_init_uniform)
model.to('cuda')
optimizer = SGD(model.parameters(),lr=1e-3,momentum=0.9)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  uniform_xavier_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Uniform-Xavier SGD + momentum loss"
  })

  uniform_xavier_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Uniform-Xavier SGD + momentum Acc"
  })

In [None]:
# RMSProp test
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=xavier_init_uniform)
model.to('cuda')
optimizer = RMSProp(model.parameters(),lr=1e-3)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  uniform_xavier_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Uniform-Xavier RMSProp loss"
  })

  uniform_xavier_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Uniform-Xavier RMSProp Acc"
  })

In [None]:
# Adam test
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=xavier_init_uniform)
model.to('cuda')
optimizer = Adam(model.parameters(),lr=1e-3)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  uniform_xavier_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Uniform-Xavier Adam loss"
  })

  uniform_xavier_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Uniform-Xavier Adam Acc"
  })

In [None]:
plot_results(uniform_xavier_data_to_plot_loss, uniform_xavier_data_to_plot_acc)

### Using Fixed **He-uniform** variant

In [None]:
uniform_he_data_to_plot_loss = []
uniform_he_data_to_plot_acc = []

In [None]:
# SGD + momentum test
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=he_init_uniform)
model.to('cuda')
optimizer = SGD(model.parameters(),lr=1e-3,momentum=0.9)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  uniform_he_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Uniform-He SGD + momentum loss"
  })

  uniform_he_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Uniform-He SGD + momentum Acc"
  })

In [None]:
# RMSProp test
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=he_init_uniform)
model.to('cuda')
optimizer = RMSProp(model.parameters(),lr=1e-3)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  uniform_he_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Uniform-He RMSProp loss"
  })

  uniform_he_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Uniform-He RMSProp Acc"
  })

In [None]:
# Adam test
model = FFNN(784,[512,1024,128],[relu,celu,swish, softmax],10,init_method=he_init_uniform)
model.to('cuda')
optimizer = Adam(model.parameters(),lr=1e-3)
with torch.no_grad():
  loss, acc = train(model, dataset_train, optimizer,epochs=10,batch_size=100,collate=custom_collate)
  test_loss, test_acc = test(model,dataset_test,batch_size=100,collate=custom_collate)

  # Formatting results to plot
  uniform_he_data_to_plot_loss.append({
    "x": [x+1 for x in range(10)],
    "y": loss,
    "name": "Uniform-He Adam loss"
  })

  uniform_he_data_to_plot_acc.append({
    "x": [x+1 for x in range(10)],
    "y": acc,
    "name": "Uniform-He Adam Acc"
  })

In [None]:
plot_results(uniform_he_data_to_plot_loss, uniform_he_data_to_plot_acc)

## Comments & results ... and more charts
After the previous set of tests there are a lot of facts we can observe and discuss about. It is clear the importance of a good initialization schema for the parameters of a neural network model, they are not only related the initial naive thought of which activation function we will be using on a layer but also to the optimization algorithm we want to use and the learn rate that will be applied in it.

These methods can cause that the same algorithm with the same learn rate start `trapped` in place of the loss function where it can maybe not escape and in consecuence not advance to our goal (learning).

A different discover which is imporant to mention is that using same learn rate for the different algorithms could not be good at all, we set them as fixed  and the same value for this specific set of tests to be able to be consistent across them, but in reality switching optimizer algorithm amerit to double check the learn rate we use.



### Same results but grouped by algorithm

#### SGD + momentum

In [None]:
sgd_loss_data = [
  fixed_xavier_data_to_plot_loss[0],
  fixed_he_data_to_plot_loss[0],
  heuristic_data_to_plot_loss[0],
  randn_data_to_plot_loss[0],
  uniform_xavier_data_to_plot_loss[0],
  uniform_he_data_to_plot_loss[0]
]

sgd_acc_data = [
  fixed_xavier_data_to_plot_acc[0],
  fixed_he_data_to_plot_acc[0],
  heuristic_data_to_plot_acc[0],
  randn_data_to_plot_acc[0],
  uniform_xavier_data_to_plot_acc[0],
  uniform_he_data_to_plot_acc[0]
]

plot_results(sgd_loss_data, sgd_acc_data)

#### RMSProp

In [None]:
rmsprop_loss_data = [
  fixed_xavier_data_to_plot_loss[1],
  fixed_he_data_to_plot_loss[1],
  heuristic_data_to_plot_loss[1],
  randn_data_to_plot_loss[1],
  uniform_xavier_data_to_plot_loss[1],
  uniform_he_data_to_plot_loss[1]
]

rmsprop_acc_data = [
  fixed_xavier_data_to_plot_acc[1],
  fixed_he_data_to_plot_acc[1],
  heuristic_data_to_plot_acc[1],
  randn_data_to_plot_acc[1],
  uniform_xavier_data_to_plot_acc[1],
  uniform_he_data_to_plot_acc[1]
]

plot_results(rmsprop_loss_data, rmsprop_acc_data)

#### Adam

In [None]:
adam_loss_data = [
  fixed_xavier_data_to_plot_loss[2],
  fixed_he_data_to_plot_loss[2],
  heuristic_data_to_plot_loss[2],
  randn_data_to_plot_loss[2],
  uniform_xavier_data_to_plot_loss[2],
  uniform_he_data_to_plot_loss[2]
]

adam_acc_data = [
  fixed_xavier_data_to_plot_acc[2],
  fixed_he_data_to_plot_acc[2],
  heuristic_data_to_plot_acc[2],
  randn_data_to_plot_acc[2],
  uniform_xavier_data_to_plot_acc[2],
  uniform_he_data_to_plot_acc[2]
]

plot_results(adam_loss_data, adam_acc_data)

### Note
In the new grouped version of our charts we can appreciate that the `heuristic` initialization is giving the best results at least for the network structure and optimization algorithms we used.

## Final Thoughts and future work

Across this work we explored the different strategies for the setup and fine optimnization of a neural netwrok model, all this steps should be taken in consideration every time a new problem is faced as we learned even a small modification can cause a potentially powerful network structure to not work at all.

In terms of possible future investigation, it is possible to explore the different variations of `Adam` algorithms and `batch normalization` techniques.