In [1]:
import importlib

import numpy as np
import pandas as pd
import torch
from torch import optim

from assignment import *

%load_ext autoreload
%autoreload 2


# Advanced Deep Learning ─ Assignment 1

## Question 1

Try to load the same data directly from the "MNIST database" website http://yann.lecun.com/exdb/mnist/. Be careful: the images can have a different normalization and encoding.


In [2]:
load_data_torch()

# Keyword sets shared by the two image files and by the two label files.
# Label files are read with normalize=False; image files rely on the loader's default.
_IMAGE_KWARGS = dict(is_image=True)
_LABEL_KWARGS = dict(is_image=False, normalize=False)


def _one_hot(labels):
  """Return `labels` as a float one-hot tensor over the 10 digit classes."""
  as_int64 = labels.to(torch.int64)
  return torch.nn.functional.one_hot(as_int64, num_classes=10).float()


# Set data sets
X_train = load_data_ylc(
  file_name="train-images-idx3-ubyte.gz", nb_images=60000, **_IMAGE_KWARGS
)
y_train = load_data_ylc(
  file_name="train-labels-idx1-ubyte.gz", nb_images=60000, **_LABEL_KWARGS
)
X_test = load_data_ylc(
  file_name="t10k-images-idx3-ubyte.gz", nb_images=10000, **_IMAGE_KWARGS
)
y_test = load_data_ylc(
  file_name="t10k-labels-idx1-ubyte.gz", nb_images=10000, **_LABEL_KWARGS
)

# Transform labels to one_hot encoding
y_train_one_hot = _one_hot(y_train)
y_test_one_hot = _one_hot(y_test)


### Q2

Using the utilities in plt and numpy, display some images and check that the corresponding labels are consistent.


In [3]:
# Build a figure from the training images and their labels
# (display_digits is presumably provided by the `assignment` module), then display it.
fig = display_digits(X_train=X_train, y_train=y_train)
fig.show()
# Save a copy of the figure to disk.
# NOTE(review): the "data" directory presumably has to exist already — confirm.
fig.write_image("data/labels.png")


### Q3

Complete the code below so as to have an MLP with one hidden layer of 300 neurons. \
Remember that we want one-hot outputs.


In [4]:
# Let us define the neural network we are using

# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

hidden_sizes = [300]
# Move the model to its final device before the optimizer is built from its parameters.
net = define_net(hidden_sizes=hidden_sizes).to(device)
lr = 0.01
# Now we define the optimizer and the loss function
loss = torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr)

# Initialize arrays to track errors
# The test error array is there for informative purposes.
# We do not use it when updating weights.
# In a real world scenario, we shouldn't even look at it to choose when to (early-) stop training.
error_train = []
error_test = []

# Total number of parameters of the network
print(sum(p.numel() for p in net.parameters()))

# Put every tensor used for training and evaluation on the same device as the model
X_train = X_train.to(device)
X_test = X_test.to(device)
y_train = y_train.to(device)
y_test = y_test.to(device)
y_test_one_hot = y_test_one_hot.to(device)

# Training inputs: each image with dimensions 1 and 2 merged into one;
# training targets: the one-hot labels.
inputs = torch.flatten(X_train, start_dim=1, end_dim=2)
labels = y_train_one_hot.to(device)


238510


### Q4

Complete the code below to perform a gradient-descent (GD) based optimization.


In [5]:
# The flattened test images do not change between iterations, so build them once.
test_inputs = torch.flatten(X_test, start_dim=1, end_dim=2)

# Full-batch gradient descent: every iteration uses the whole training set.
for k in range(2000):
  optimizer.zero_grad()

  # The model and the inputs are already on `device`, so the outputs are too.
  outputs = net(inputs)

  # Define the empirical risk
  risk = loss(outputs, labels)

  # Make the backward step (1 line instruction)
  risk.backward()

  # Update the parameters (1 line instruction)
  optimizer.step()

  # Monitoring only: the test loss is computed without tracking gradients
  # and never feeds back into the weight updates.
  with torch.no_grad():
    y_pred_one_hot = net(test_inputs)
    prediction_loss = loss(y_pred_one_hot, y_test_one_hot)

  # Read each scalar once and reuse it for both the history and the log line.
  train_loss = risk.item()
  test_loss = prediction_loss.item()
  error_train.append(train_loss)
  error_test.append(test_loss)

  print(f"k = {k}, \tRisk = {train_loss}, \tPrediction loss = {test_loss}")



k = 0, 	Risk = 2.347132921218872, 	Prediction loss = 2.3408586978912354
k = 1, 	Risk = 2.3397789001464844, 	Prediction loss = 2.334556818008423
k = 2, 	Risk = 2.333519220352173, 	Prediction loss = 2.3291683197021484
k = 3, 	Risk = 2.328174352645874, 	Prediction loss = 2.3245463371276855
k = 4, 	Risk = 2.3235952854156494, 	Prediction loss = 2.3205695152282715
k = 5, 	Risk = 2.3196613788604736, 	Prediction loss = 2.317136764526367
k = 6, 	Risk = 2.3162713050842285, 	Prediction loss = 2.314164876937866
k = 7, 	Risk = 2.313340902328491, 	Prediction loss = 2.3115828037261963
k = 8, 	Risk = 2.3107993602752686, 	Prediction loss = 2.3093316555023193
k = 9, 	Risk = 2.3085875511169434, 	Prediction loss = 2.3073620796203613
k = 10, 	Risk = 2.3066556453704834, 	Prediction loss = 2.3056318759918213
k = 11, 	Risk = 2.304962158203125, 	Prediction loss = 2.304105520248413
k = 12, 	Risk = 2.303471088409424, 	Prediction loss = 2.30275297164917
k = 13, 	Risk = 2.302152633666992, 	Prediction loss = 2.3015

In [6]:

# Gather the per-iteration losses into one frame: one row per iteration,
# one column per data set.
loss_history = {"train_error": error_train, "test_error": error_test}
df_results = pd.DataFrame(loss_history)


In [7]:
# Plot the recorded train/test losses.
# Pass the learning rate the optimizer was actually built with (`lr`) rather than
# a separately typed literal, so the value given to plot_errors matches the run.
fig = plot_errors(df_results=df_results, hidden_sizes=hidden_sizes, lr=lr)
fig.show()

# Same plot with a logarithmic y-axis
fig = plot_errors(
  df_results=df_results, hidden_sizes=hidden_sizes, log_y=True, lr=lr
)
fig.show()


Cross-entropy loss (no logarithmic scale)


Cross-entropy loss (logarithmic scale)


### Q5
Compute the final accuracy on the test set.


In [8]:
# Inference only: disable gradient tracking so no autograd graph is built
# for the forward pass over the test set.
with torch.no_grad():
  y_pred_one_hot = net(torch.flatten(X_test, start_dim=1, end_dim=2))

# The predicted class is the index of the largest output score
y_pred = torch.argmax(input=y_pred_one_hot, dim=1)

# Fraction of test samples whose predicted class equals the true label
acc = (y_test == y_pred).sum() / len(y_test)
print("Final accuracy on test", float(acc))


Final accuracy on test 0.7863999605178833
