<a href="https://colab.research.google.com/github/jimbounce/BDSML_HousePrice/blob/main/njc_house.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Coursework II -- Training hyperparameters

The goal of the coursework is to modify a simple bit of numpy code that trains a network and measures the performance on a validation set for the MNist 1D dataset.

In this coursework, you need to modify the **training hyperparameters** (only) to improve the performance over the current attempt. This could mean the training algorithm, learning rate, learning rate schedule, momentum term, initialization etc.  

You must improve the performance by at least 2% to get full marks.

You will need to upload three things to Moodle:
1.   The image that this notebook saves (click the folder icon on the left on colab to download it)
2.   The lines of code you changed
3.   The whole notebook as a .ipynb file.  You can do this on the File menu

In [None]:
import numpy as np
import os
import torch, torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import StepLR
import matplotlib.pyplot as plt
import random
import gdown

In [None]:
# train: https://drive.google.com/file/d/10dPJKqwVWp_-VXath28y0Vy58biCm9sl/view?usp=drive_link
# test: https://drive.google.com/file/d/1v7wvkUHj3jm3v40N0u3d340dMKaLamyH/view?usp=drive_link

# Run this once to copy the train and validation data to your CoLab environment
# !gdown 10dPJKqwVWp_-VXath28y0Vy58biCm9sl
# !gdown 1v7wvkUHj3jm3v40N0u3d340dMKaLamyH


# Load CSV file
csv_train = np.genfromtxt('/content/house_train.csv', delimiter=',')
csv_test = np.genfromtxt('/content/house_test.csv', delimiter=',')
# Convert CSV data to NPY array
np.save('/content/house_train.npy', csv_train)
np.save('/content/house_test.npy', csv_test)

# Delete csv's
np.delete('/content/house_train.csv')
np.delete('/content/house_test.csv')

# DL course version (not needed)
# if not os.path.exists('./Data.zip'):
#   !gdown 1HtnCrncY6dFCYqzgPf1HtPVAerTpwFRm
#   !unzip Data.zip

# New section

In [14]:
# Load the training .npy file
train_data = np.load('/content/house_train.npy')

# Split into X and y
tr_X = train_data[1:, 1:-1]  # not first row (b4 comma), Assumes the last column contains the labels
tr_y = train_data[1:, -1]   # Assumes the last column contains the labels
# Save X and y to separate files
np.save('/content/train_data_x.npy', tr_X)
np.save('/content/train_data_y.npy', tr_y)

# Load the testing .npy file
test_data = np.load('/content/house_test.npy')

# Split into X and y
te_X = test_data[1:, 1:-1]  # Assumes the last column contains the labels
te_y = test_data[1:, -1]   # Assumes the last column contains the labels
# Save X and y to separate files
np.save('/content/test_data_x.npy', te_X)
np.save('/content/test_data_y.npy', te_y)

# Load in the data
train_data_X = np.load('/content/train_data_x.npy',allow_pickle=True)
train_data_y = np.load('/content/train_data_y.npy',allow_pickle=True)
val_data_X = np.load('/content/test_data_x.npy',allow_pickle=True)
val_data_y = np.load('/content/test_data_y.npy',allow_pickle=True)
# # Print out sizes
print("Train data: %d examples (columns), each of which has %d dimensions (rows)"%((train_data_X.shape[1],train_data_X.shape[0])))
print("Validation data: %d examples (columns), each of which has %d dimensions (rows)"%((val_data_X.shape[1],val_data_X.shape[0])))

print(train_data_y[:10])

print(val_data_y[:10])

print(train_data_X[:10])

print(val_data_X[:10])

# np.delete(test_data_x.npy)
# np.delete(test_data_y.npy)
# np.delete(train_data_x.npy)
# np.delete(train_data_y.npy)

Train data: 79 examples (columns), each of which has 1460 dimensions (rows)
Validation data: 78 examples (columns), each of which has 1459 dimensions (rows)
[208500. 181500. 223500. 140000. 250000. 143000. 307000. 200000. 129900.
 118000.]
[nan nan nan nan nan nan nan nan nan nan]
[[6.0000e+01        nan 6.5000e+01 8.4500e+03        nan        nan
         nan        nan        nan        nan        nan        nan
         nan        nan        nan        nan 7.0000e+00 5.0000e+00
  2.0030e+03 2.0030e+03        nan        nan        nan        nan
         nan 1.9600e+02        nan        nan        nan        nan
         nan        nan        nan 7.0600e+02        nan 0.0000e+00
  1.5000e+02 8.5600e+02        nan        nan        nan        nan
  8.5600e+02 8.5400e+02 0.0000e+00 1.7100e+03 1.0000e+00 0.0000e+00
  2.0000e+00 1.0000e+00 3.0000e+00 1.0000e+00        nan 8.0000e+00
         nan 0.0000e+00        nan        nan 2.0030e+03        nan
  2.0000e+00 5.4800e+02        nan    

Define the network

In [None]:
# YOU SHOULD NOT CHANGE THIS CELL!

# There are 40 input dimensions and 10 output dimensions for this data
# The inputs correspond to the 40 offsets in the MNIST1D template.
D_i = 40
# The outputs correspond to the 10 digits
D_o = 10

# Number of hidden units in layers 1 and 2
D_1 = 100
D_2 = 100

# create model with two hidden layers
model = nn.Sequential(
nn.Linear(D_i, D_1),
nn.ReLU(),
nn.Linear(D_1, D_2),
nn.ReLU(),
nn.Linear(D_2, D_o))

In [None]:
# He initialization of weights
def weights_init(layer_in):
  if isinstance(layer_in, nn.Linear):
    nn.init.kaiming_uniform_(layer_in.weight)
    layer_in.bias.data.fill_(0.0)

In [None]:
# You need all this stuff to ensure that PyTorch is deterministic
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
# Set seed so always get same result (do not change)
set_seed(1)
# choose cross entropy loss function (equation 5.24 in the loss notes)
loss_function = nn.CrossEntropyLoss()




# construct SGD optimizer and initialize learning rate and momentum
optimizer = torch.optim.SGD(model.parameters(), lr = 0.05, momentum=0.9)


# object that decreases learning rate by half every 10 epochs
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)



# create 100 dummy data points and store in data loader class
x_train = torch.tensor(train_data_x.transpose().astype('float32'))
print(x_train.shape)
y_train = torch.tensor(train_data_y.astype('long'))
print(y_train.shape)
x_val= torch.tensor(val_data_x.transpose().astype('float32'))
y_val = torch.tensor(val_data_y.astype('long'))

# load the data into a class that creates the batches
data_loader = DataLoader(TensorDataset(x_train,y_train), batch_size=100, shuffle=True, worker_init_fn=np.random.seed(1))

# Initialize model weights
model.apply(weights_init)

# loop over the dataset n_epoch times
n_epoch = 50
# store the loss and the % correct at each epoch
losses_train = np.zeros((n_epoch))
errors_train = np.zeros((n_epoch))
losses_val = np.zeros((n_epoch))
errors_val = np.zeros((n_epoch))

for epoch in range(n_epoch):
  # loop over batches
  for i, data in enumerate(data_loader):
    # retrieve inputs and labels for this batch
    x_batch, y_batch = data
    # zero the parameter gradients
    optimizer.zero_grad()
    # forward pass -- calculate model output
    pred = model(x_batch)
    # compute the lss
    loss = loss_function(pred, y_batch)
    # backward pass
    loss.backward()
    # SGD update
    optimizer.step()

  # Run whole dataset to get statistics -- normally wouldn't do this
  pred_train = model(x_train)
  pred_val = model(x_val)
  _, predicted_train_class = torch.max(pred_train.data, 1)
  _, predicted_val_class = torch.max(pred_val.data, 1)
  errors_train[epoch] = 100 - 100 * (predicted_train_class == y_train).float().sum() / len(y_train)
  errors_val[epoch]= 100 - 100 * (predicted_val_class == y_val).float().sum() / len(y_val)
  losses_train[epoch] = loss_function(pred_train, y_train).item()
  losses_val[epoch]= loss_function(pred_val, y_val).item()
  print(f'Epoch {epoch:5d}, train loss {losses_train[epoch]:.6f}, train error {errors_train[epoch]:3.2f},  val loss {losses_val[epoch]:.6f}, percent error {errors_val[epoch]:3.2f}')

  # tell scheduler to consider updating learning rate
  scheduler.step()

# Plot the results
fig, ax = plt.subplots()
ax.plot(errors_train,'r-',label='train')
ax.plot(errors_val,'b-',label='validation')
ax.set_ylim(0,100); ax.set_xlim(0,n_epoch)
ax.set_xlabel('Epoch'); ax.set_ylabel('Error')
ax.set_title('Part II: Validation Result %3.2f'%(errors_val[-1]))
ax.legend()
ax.plot([0,n_epoch],[37.45, 37.45],'k:') # Original results. You should be better than this!
plt.savefig('Coursework_II_Results.png',format='png')
plt.show()

In [None]:
# Leave this all commented for now
# We'll see how well you did on the test data after the coursework is submitted

# # I haven't given you this yet, leave commented
# test_data_x = np.load('test_data_x.npy')
# test_data_y = np.load('test_data_y.npy')
# x_test = torch.tensor(test_data_x.transpose().astype('float32'))
# y_test = torch.tensor(test_data_y.astype('long'))
# pred_test = model(x_test)
# _, predicted_test_class = torch.max(pred_test.data, 1)
# errors_test = 100 - 100 * (predicted_test_class == y_test).float().sum() / len(y_test)
# print("Test error = %3.3f"%(errors_test))