In [2]:
import torch
import torchvision
import datasets, networks, sampling
# We use matplotlib to plot the loss curve.
import matplotlib.pyplot as plt
# Use shorthands (nn instead of torch.nn, optim instead of torch.optim)
from torch import nn, optim

In [3]:
# Hyperparameters: settings that you (a human) have to pick yourself,
# usually through trial and error, rather than values learned in training.

# An epoch is one full pass over the training dataset of 60,000 images.
# With too few epochs the network will not be able to learn well.
# Seeing each image 5 times should be enough to reach >96% accuracy (we get ~98%).
# Don't change this value, we will use 5 epochs.
num_epochs = 5  # don't change this value

# Number of images that go into each update of the network.
##############################
##############################
# TODO Select a batch size.
batch_size = 32
##############################
##############################

# Controls how drastically the network's parameters change per update.
# This value needs to be selected carefully.
##############################
##############################
# TODO Select a learning rate.
lr = 1e-4
##############################
##############################

# Size arguments that are passed to the network constructor.
num_units, num_layer = 500, 5


# Build the "data loaders" for training and testing with the batch size
# selected above. Loaders can do things like multiprocessing and shuffling the
# order of the images; iterating over one yields batches of images and labels
# (see the training loop below).
##############################
# TODO Write your code here.
Ds = datasets.Dataset('mnist', batch_size=batch_size)
training_data, test_data = Ds.get_train_data_loader(), Ds.get_test_data_loader()

# Ordering handed to the network constructor: the integers 1 through 784 (28*28).
ordering = range(1, 28 * 28 + 1)

##############################

# Instantiate the neural network. Its optimizer is created at the end of this
# section, after the network has been moved to the training device.
##############################
##############################
# TODO Write your code here.
MADE = networks.MADE(
    num_layer=num_layer,
    num_units=num_units,
    ordering=ordering,
    input_feat=28 * 28,
)
##############################
##############################

# Pick the training device: the first GPU if CUDA is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
print('=========================================')

MADE.to(device)  # Move the neural network onto the selected device (GPU or CPU).

# Containers for the loss and accuracy curves that are plotted after training.
loss_curve, accuracy_curve, epoch_markers = [], [], []

# Adam optimizer over all network parameters, using the learning rate `lr`.
optimizer = optim.Adam(MADE.parameters(), lr=lr)

# Training loop: make `num_epochs` passes over the training data, update the
# network after every batch, and record a windowed average of the loss in
# `loss_curve` every 100 batches.
for epoch in range(num_epochs):
  # Start a new epoch and iterate over all batches of the training data loader.

  # Switch to training mode.
  MADE.train()

  # Losses collected since the last log line. The list is reset here at the
  # start of every epoch and again after each log line.
  losses = []

  # enumerate() supplies the zero-based batch index, so no hand-maintained
  # counter is needed.
  for batch_idx, (images, labels) in enumerate(training_data):
    # images and labels are tensors.
    # Move the tensors to the device.
    images = images.to(device)
    labels = labels.to(device)

    # At this point, you need to implement the following steps:
    # 1. Compute the outputs of our neural network (don't use predict()!).
    # 2. Compute the cross-entropy loss from nn.functional.
    # 3. Compute the gradients of the loss function with respect to all parameters.
    # 4. Update all parameters using the optimizer.
    # 5. Reset the gradients, we don't want them anymore (important!).
    ##############################
    ##############################
    # TODO Write your code here.

    # Call the module itself instead of .forward(): nn.Module.__call__ runs any
    # registered hooks, whereas a direct .forward() call silently skips them.
    output = MADE(images)
    # NOTE(review): the first logged loss (6.7170) is close to ln(784) ~= 6.66,
    # which suggests `output` has 784 columns and is being scored here as a
    # 784-way classification against the digit labels. If the network is meant
    # to model the pixels themselves, the target should presumably be derived
    # from `images` instead -- confirm against networks.MADE before changing.
    loss = nn.functional.cross_entropy(output, labels)
    loss.backward()
    optimizer.step()
    # Clear the gradients so they do not accumulate into the next batch.
    optimizer.zero_grad()

    ##############################
    ##############################

    # Save a detached copy of the loss value (no autograd graph attached).
    losses.append(loss.detach().clone())
    # Print the average loss now and then: every 100th batch, averaged over the
    # batches seen since the previous log line.
    if batch_idx % 100 == 0:
      average_loss = torch.stack(losses).mean().item()
      loss_curve.append(average_loss)
      losses = []
      print(f'Epoch: {epoch + 1:3d}/{num_epochs:3d}, Batch {batch_idx + 1:5d}, Loss: {average_loss:.4f}')




Using device: cuda:0
Epoch:   1/  5, Batch     1, Loss: 6.7170
Epoch:   1/  5, Batch   101, Loss: 6.2821
Epoch:   1/  5, Batch   201, Loss: 5.5087
Epoch:   1/  5, Batch   301, Loss: 4.8734
Epoch:   1/  5, Batch   401, Loss: 4.3751
Epoch:   1/  5, Batch   501, Loss: 3.9643
Epoch:   1/  5, Batch   601, Loss: 3.6550
Epoch:   1/  5, Batch   701, Loss: 3.3961


KeyboardInterrupt: 