In [789]:
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt

In [790]:
# Training and validation data: feature matrices and their label matrices,
# read from header-less CSV files in the working directory.
x_train, y_train, x_val, y_val = (
    np.array(pd.read_csv(csv_name, header=None).values)
    for csv_name in (
        'training_set.csv',
        'training_labels_bin.csv',
        'validation_set.csv',
        'validation_labels_bin.csv',
    )
)

In [791]:
# Row counts of each split; the label counts are printed right after the
# matching feature counts so a mismatch is visible at a glance.
N, M = len(x_train), len(x_val)
print(N, len(y_train), M, len(y_val), sep='\n')

8442
8442
1048
1048


In [792]:
# Column counts: width of a feature row and width of a label row.
num_feats, n_out = x_train.shape[1], y_train.shape[1]
print(num_feats, n_out, sep='\n')

354
3


In [793]:
# add room for bias: append a constant-1 column to every sample so the
# first-layer bias can be stored as an extra row of the first weight matrix.
# The width check keeps this cell idempotent - running it a second time must
# not append a second column of ones (num_feats is the width before the append).
bias_train = np.ones((x_train.shape[0], 1))
bias_val = np.ones((x_val.shape[0], 1))
if x_train.shape[1] == num_feats:
    x_train = np.append(x_train, bias_train, axis = 1)
if x_val.shape[1] == num_feats:
    x_val = np.append(x_val, bias_val, axis = 1)

In [794]:
def sigmoid(x):
  """Logistic function 1/(1+exp(-x)), applied element-wise.

  Args:
    x: scalar or array-like of real numbers.

  Returns:
    Values in [0, 1] with the same shape as x (a scalar for scalar input).
  """
  x = np.asarray(x)
  # exp(-|x|) lies in [0, 1], so it cannot overflow the way exp(-x) does for
  # large negative x.
  decay = np.exp(-np.abs(x))
  # For x >= 0 this is the textbook form; for x < 0 numerator and denominator
  # are both multiplied by exp(x), which is the same value algebraically.
  out = np.where(x >= 0, 1/(1 + decay), decay/(1 + decay))
  # [()] turns a 0-d result back into a scalar and leaves real arrays as arrays
  return out[()]

def grad_sigmoid(x):
  """Derivative of the logistic function at x, i.e. s*(1-s) with s = sigmoid(x)."""
  s = sigmoid(x)
  return s*(1-s)

def squared_error(guess, correct):
  """Sum of squared differences between a prediction and its target.

  Args:
    guess: array-like prediction.
    correct: array-like target, broadcastable against guess.

  Returns:
    The squared differences summed over the first axis - a scalar for 1-D
    inputs.
  """
  residual = np.asarray(guess) - np.asarray(correct)
  # np.sum reduces inside numpy instead of looping over elements in Python;
  # axis=0 matches what iterating over the array with builtin sum() reduces.
  return np.sum(residual**2, axis=0)

def grad_squared_error(guess, correct):
  """Gradient of the summed squared error with respect to guess.

  Args:
    guess: 1-D array-like prediction.
    correct: 1-D array-like target of the same length.

  Returns:
    float array of shape (1, len(guess)) holding 2*(guess - correct).
  """
  guess = np.asarray(guess, dtype=float)
  correct = np.asarray(correct, dtype=float)
  # one vectorised expression instead of filling the vector entry by entry
  return np.reshape(2*(guess - correct), (1, -1))

# fully connected, 2 hidden layers, sigmoid on every layer, squared-error loss
# an input row holds the n_in features followed by a trailing 1, so the
# first-layer bias is just the last row of the first weight matrix
class MLP:
  """Two-hidden-layer sigmoid network trained one sample at a time.

  Attributes:
    w: the three weight matrices, shapes (n_in+1, n_hidden),
      (n_hidden, n_hidden) and (n_hidden, n_out).
    a: pre-activation of each layer from the latest forward pass.
    h: sigmoid activation of each layer from the latest forward pass.
    grad_table: loss gradient for each weight matrix from the latest backprop.
    J: squared-error loss from the latest forward pass (None before the first).
  """

  def __init__(self, n_in=None, n_hidden=None, n_out=3):
    """Draw every weight uniformly from [-1, 1).

    Args:
      n_in: number of input features, not counting the trailing bias entry.
        Defaults to the notebook-level num_feats.
      n_hidden: width of both hidden layers. Defaults to n_in.
      n_out: number of output units.
    """
    if n_in is None:
      n_in = num_feats
    if n_hidden is None:
      n_hidden = n_in
    self.n_in, self.n_hidden, self.n_out = n_in, n_hidden, n_out

    # initialize the layers; the draw order (W1, W2, W3, bias) is kept fixed so
    # that a seeded run always produces the same weights
    W1 = np.random.uniform(-1, 1, (n_in, n_hidden))
    W2 = np.random.uniform(-1, 1, (n_hidden, n_hidden))
    W3 = np.random.uniform(-1, 1, (n_hidden, n_out))

    # put the bias into the first weight matrix so adding it is simply a
    # result of the first matrix multiplication
    bias = np.random.uniform(-1, 1, (1, n_hidden))
    W1 = np.append(W1, bias, axis = 0)

    self.w = [W1, W2, W3]
    widths = (n_hidden, n_hidden, n_out)
    self.a = [np.zeros((1, k)) for k in widths]
    self.h = [np.zeros((1, k)) for k in widths]

    # separate zero arrays, so the gradient slots never alias the weights
    self.grad_table = [np.zeros_like(W) for W in self.w]
    self.J = None
    return

  def forward_pass(self, src, sink):
    """Run one sample through the network and record its loss in self.J.

    Args:
      src: input sample with n_in+1 entries (features plus trailing 1).
      sink: target with n_out entries.

    Raises:
      ValueError: from np.reshape when src or sink has the wrong size.
    """
    src = np.reshape(src, (1, self.n_in + 1))
    sink = np.reshape(sink, (1, self.n_out))

    layer_in = src
    for i in range(len(self.w)):
      self.a[i] = np.matmul(layer_in, self.w[i])
      self.h[i] = sigmoid(self.a[i])
      layer_in = self.h[i]

    self.J = squared_error(self.h[-1][0], sink[0])

  def backprop(self, src, sink, lr):
    """Take one gradient-descent step using the latest forward pass.

    forward_pass must have been called with the same src/sink first, because
    the activations stored in self.h are reused here.

    Args:
      src: input sample with n_in+1 entries (features plus trailing 1).
      sink: target with n_out entries.
      lr: learning rate applied to every weight matrix.
    """
    sink = np.reshape(sink, (1, self.n_out))
    src = np.reshape(src, (1, self.n_in + 1))

    g = grad_squared_error(self.h[-1][0], sink[0])
    for i in range(len(self.w) - 1, -1, -1):
      # sigmoid'(a) = h*(1-h); the activation is already stored, so the
      # exponential does not need to be evaluated again
      g = g*(self.h[i]*(1 - self.h[i]))
      layer_in = src if i == 0 else self.h[i-1]
      self.grad_table[i] = np.matmul(layer_in.T, g)
      if i > 0:
        # propagate through the weights as they were during the forward pass,
        # i.e. before this layer's update; nothing lies below layer 0
        g = np.matmul(g, self.w[i].T)
      self.w[i] -= lr*self.grad_table[i]

In [797]:
# hyperparameters (you may change these)

# learning-rate schedule
eta = 0.1  # initial learning rate
gamma = 0.1  # factor the learning rate is multiplied by at each change
stepsize = 20  # number of epochs between learning-rate changes

# epoch budget, validation cadence and stopping criterion
max_epoch = 75
test_interval = 5  # number of epochs between validation runs
threshold = 0.01  # stop once the validation error drops below this

In [None]:
perceptron = MLP()

# Decay a local copy of the learning rate: the hyperparameter eta stays
# untouched, so re-running this cell always starts from the configured rate
# instead of from whatever a previous run decayed it to.
lr = eta

# epochs are numbered 1..max_epoch inclusive, so max_epoch epochs can run
for epoch in range(1, max_epoch + 1):

    order = np.random.permutation(N) # shuffle data

    sse = 0
    for idx in order:
        # get a sample (batch size=1)
        x_in = x_train[idx]
        y = y_train[idx]

        perceptron.forward_pass(x_in, y)
        perceptron.backprop(x_in, y, lr)

        # loss measured before this sample's weight update
        sse += perceptron.J

    train_mse = sse/N
    print("Training:", train_mse)

    if epoch % test_interval == 0 or epoch == 1:
        # forward passes only: the validation set never updates the weights
        val_sse = 0
        for m in range(0, M):
            perceptron.forward_pass(x_val[m], y_val[m])
            val_sse += perceptron.J

        val_mse = val_sse/M
        print("Validation: ", val_mse)
        # if termination condition is satisfied, exit
        if val_mse < threshold:
            break

    if epoch % stepsize == 0:
        lr = lr*gamma
        print('Changed learning rate to lr=' + str(lr))

Training: 1.1586270582772364
Validation:  1.435851716962441
Training: 0.6467186027745676
Training: 0.4243938414746
Training: 0.3337393938938933
Training: 0.26841682459424687
Validation:  0.2582387891413158
Training: 0.22466100361668256
Training: 0.18607037669890622
Training: 0.17219506437123536
Training: 0.15644343563428595
Training: 0.12999215947158152
Validation:  0.158086053257577
Training: 0.12120974418514442
Training: 0.10950199472976313
Training: 0.1067527023738637
Training: 0.08630176984409145
Training: 0.08569277251318515
Validation:  0.17890504310675995
Training: 0.07842624161900756
Training: 0.06879771500205109
Training: 0.06681009145105594
Training: 0.06211205883779723
Training: 0.05790321198958399
Validation:  0.11217622282174593
Changed learning rate to lr=0.010000000000000002
Training: 0.024203628745038456
Training: 0.019740226162941743
Training: 0.018785988588258723
Training: 0.01763048573179454
Training: 0.01745777732997442
Validation:  0.06917558930140202
Training: 0.0