In [33]:
# Mount Google Drive in Colab and switch to the assignment folder
import os
from google.colab import drive

# Attach the user's Google Drive to the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Assignment folder inside the mounted drive (note the space in "My Drive").
root = '/content/drive/My Drive/_Y3/ECE421_W/A2'
# Make it the working directory so relative paths such as "notMNIST.npz"
# (opened by loadData) resolve inside this folder.
os.chdir(root)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
%tensorflow_version 1.x
#starter code given below

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import time
# Presumably silences TensorFlow's C++ backend logging.
# NOTE(review): this is set after `import tensorflow` above; such variables are
# usually read at import time, so confirm it still takes effect here.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Load the data
def loadData():
    """Load notMNIST.npz, shuffle it with a fixed seed and split it three ways.

    Reads the "images" and "labels" arrays from "notMNIST.npz" in the current
    working directory, applies the same seeded permutation to both, divides
    the images by 255.0, and slices the result into a training part (first
    10000 samples), a validation part (next 6000) and a test part (the rest).

    Side effect: reseeds NumPy's global random generator with 521.

    Returns:
        (trainData, validData, testData, trainTarget, validTarget, testTarget)
    """
    with np.load("notMNIST.npz") as archive:
        images = archive["images"]
        labels = archive["labels"]

    # One shared permutation keeps every image aligned with its label.
    np.random.seed(521)
    order = np.arange(len(images))
    np.random.shuffle(order)
    images = images[order] / 255.0
    labels = labels[order]

    train_end, valid_end = 10000, 16000
    train_part = slice(None, train_end)
    valid_part = slice(train_end, valid_end)
    test_part = slice(valid_end, None)

    return (
        images[train_part], images[valid_part], images[test_part],
        labels[train_part], labels[valid_part], labels[test_part],
    )


# Implementation of a neural network using only Numpy - trained using gradient descent with momentum
def _one_hot(labels, num_classes):
    """Return a (len(labels), num_classes) float array with a 1 at each label index."""
    labels = np.asarray(labels)
    count = labels.shape[0]
    encoded = np.zeros((count, num_classes))
    if labels.size:  # indexing with an empty non-integer array would raise
        # Shape the row indices so they broadcast against `labels` row by row,
        # i.e. row i is only ever written at the column(s) named by labels[i].
        rows = np.arange(count).reshape((count,) + (1,) * (labels.ndim - 1))
        encoded[rows, labels] = 1
    return encoded


def convertOneHot(trainTarget, validTarget, testTarget, num_classes=10):
    """One-hot encode the train/validation/test integer label arrays.

    Args:
        trainTarget, validTarget, testTarget: arrays of integer class indices,
            one entry per sample.
        num_classes: width of each one-hot row (defaults to 10, the value that
            was previously hard-coded).

    Returns:
        (newtrain, newvalid, newtest): float arrays of shape
        (n_samples, num_classes) holding a 1 in each sample's class column.

    Raises:
        IndexError: if a label is not a valid column index for num_classes.
    """
    return (
        _one_hot(trainTarget, num_classes),
        _one_hot(validTarget, num_classes),
        _one_hot(testTarget, num_classes),
    )


def shuffle(trainData, trainTarget):
    """Reorder data and targets with one shared, fixed-seed permutation.

    Side effect: reseeds NumPy's global random generator with 421, so every
    call produces the same ordering for inputs of the same length.

    Returns:
        (data, target): the inputs indexed by the same permutation.
    """
    np.random.seed(421)
    order = np.arange(len(trainData))
    np.random.shuffle(order)
    return trainData[order], trainTarget[order]

# 1.1 Helper Functions

In [37]:
def relu(x):
  """Element-wise rectifier: every entry below zero is replaced by zero."""
  rectified = np.maximum(x, 0)
  return rectified

def softmax(x):
  """Numerically stable softmax over the last axis.

  Args:
    x: array of scores; a (n_samples, n_classes) matrix is normalised row by
      row, and a 1-D vector is treated as a single set of scores.

  Returns:
    Array of the same shape as x whose entries along the last axis are
    positive and sum to 1.
  """
  x = np.asarray(x)
  # Subtracting the per-row maximum leaves the result unchanged mathematically
  # but keeps np.exp from overflowing on large scores.
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exps = np.exp(shifted)  # computed once and reused for the normaliser
  return exps / np.sum(exps, axis=-1, keepdims=True)

def computeLayer(X, W, b):
  """Affine layer: matrix product of X and W with the bias b added (broadcast)."""
  return np.add(np.matmul(X, W), b)

def CE(target, prediction):
  """Average cross-entropy between one-hot targets and predicted probabilities.

  Args:
    target: (n_samples, n_classes) array of target weights (one-hot rows).
    prediction: (n_samples, n_classes) array of predicted probabilities.

  Returns:
    Scalar: the mean over samples of -sum_k target_k * log(prediction_k).
  """
  prediction = np.asarray(prediction)
  if np.issubdtype(prediction.dtype, np.floating):
    floor = np.finfo(prediction.dtype).tiny
  else:
    floor = np.finfo(float).tiny
  # A probability that underflowed to exactly 0 would give log(0) = -inf, and
  # 0 * -inf is NaN, which poisons the whole loss. Flooring at the smallest
  # positive normal float leaves every non-zero probability untouched.
  safe_prediction = np.maximum(prediction, floor)
  persample_loss = np.sum(target * np.log(safe_prediction), axis=1)
  return -np.mean(persample_loss)

def gradCE(target, prediction):
  """Return the element-wise difference prediction - target.

  No averaging over samples is applied here; the result has one row per sample.
  """
  residual = prediction - target
  return residual

#1.2 Backpropagation Derivation

In [None]:
def backprop(x, x_h, W_o, target, prediction):
  """Compute the gradients for the hidden and output layer parameters.

  Shapes as used by train(): x is (N, F), x_h is (N, H), W_o is (H, K),
  target and prediction are (N, K).

  Returns:
    dwO: (H, K) gradient for the output weights.
    dbO: (1, K) gradient for the output bias.
    dwH: (F, H) gradient for the hidden weights.
    dbH: (1, H) gradient for the hidden bias.
  """
  # Error signal at the output layer (with respect to the softmax inputs).
  delta_out = gradCE(target, prediction)

  # Output layer: weights see the hidden activations; the bias gradient is
  # the per-sample deltas summed by multiplying with a row of ones.
  dwO = np.dot(x_h.T, delta_out)
  ones_out = np.ones((1, target.shape[0]))
  dbO = np.matmul(ones_out, delta_out)

  # Hidden layer: push the error back through W_o, then gate it with the
  # ReLU derivative evaluated on the hidden activations.
  relu_gate = relugrad(x_h)
  back_signal = np.matmul(delta_out, W_o.T)
  delta_hidden = relu_gate * back_signal
  dwH = np.matmul(x.T, delta_hidden)
  ones_hidden = np.ones((1, x_h.shape[0]))
  dbH = np.matmul(ones_hidden, delta_hidden)

  return dwO, dbO, dwH, dbH

#1.3 Learning

In [None]:
#helper functions:

##gradient of ReLU
def relugrad(x):
  """Derivative of ReLU: integer 1 where x is strictly positive, otherwise 0."""
  is_positive = x > 0
  return np.where(is_positive, 1, 0)

##accuracy function
def acc(prediction, y):
  """Fraction of rows where the arg-max of `prediction` matches the arg-max of `y`."""
  # Reduce each row to the index of its largest entry before comparing.
  predicted_class = np.argmax(prediction, axis=1)
  true_class = np.argmax(y, axis=1)
  n_correct = np.sum(predicted_class == true_class)
  return n_correct / y.shape[0]

In [None]:
def _forward(X, W_h, b_h, W_o, b_o):
    """Run one forward pass; return (hidden activations, softmax predictions)."""
    hidden = relu(computeLayer(X, W_h, b_h))
    return hidden, softmax(computeLayer(hidden, W_o, b_o))


def train(epochs = 200, hidden_size = 1000, alpha = 0.1*1e-4, gamma = 0.99):
    """Train a one-hidden-layer ReLU/softmax network with momentum gradient descent.

    Loads the data with loadData(), flattens each image to a feature vector,
    one-hot encodes the labels, and runs `epochs` full-batch updates. Loss and
    accuracy are recorded before each update; the training and validation
    curves are plotted at the end.

    Args:
        epochs: number of full-batch updates.
        hidden_size: number of hidden units.
        alpha: learning rate applied to each gradient.
        gamma: momentum coefficient applied to the previous velocity.

    Returns:
        (W_o, b_o): the trained output-layer weights and bias. The hidden-layer
        parameters are not returned.
    """
    trainData, validData, testData, trainTarget, validTarget, testTarget = loadData()

    # Seed only after loadData(): it reseeds NumPy's global generator itself,
    # so a seed set before that call would not govern the weight draws below.
    np.random.seed(328)

    trainTarget, validTarget, testTarget = convertOneHot(trainTarget, validTarget, testTarget)
    trainData = trainData.reshape(trainData.shape[0], trainData.shape[1] * trainData.shape[2])
    validData = validData.reshape(validData.shape[0], validData.shape[1] * validData.shape[2])
    testData = testData.reshape(testData.shape[0], testData.shape[1] * testData.shape[2])

    K = trainTarget.shape[1]  # number of classes
    F = trainData.shape[1]    # number of input features per sample

    # Xavier initialisation: the variance is 2 / (fan_in + fan_out).
    # np.random.normal takes a standard deviation, hence the square root.
    W_o = np.random.normal(0, np.sqrt(2.0 / (hidden_size + K)), (hidden_size, K))
    b_o = np.zeros((1, K))
    W_h = np.random.normal(0, np.sqrt(2.0 / (F + hidden_size)), (F, hidden_size))
    b_h = np.zeros((1, hidden_size))

    # Momentum velocities, one per parameter, started at a small constant.
    v_W_o = np.full((hidden_size, K), 1e-5)
    v_b_o = np.full((1, K), 1e-5)
    v_W_h = np.full((F, hidden_size), 1e-5)
    v_b_h = np.full((1, hidden_size), 1e-5)

    trainLoss, validLoss, testLoss = [], [], []
    trainAcc, validAcc, testAcc = [], [], []

    for i in range(epochs):
        if i % 25 == 0:
            print("epoch:", i)

        # Forward passes with the current parameters on all three splits.
        x_h, prediction = _forward(trainData, W_h, b_h, W_o, b_o)
        _, Vprediction = _forward(validData, W_h, b_h, W_o, b_o)
        _, Teprediction = _forward(testData, W_h, b_h, W_o, b_o)

        trainLoss.append(CE(trainTarget, prediction))
        validLoss.append(CE(validTarget, Vprediction))
        testLoss.append(CE(testTarget, Teprediction))  # tracked but not plotted

        trainAcc.append(acc(prediction, trainTarget))
        validAcc.append(acc(Vprediction, validTarget))
        testAcc.append(acc(Teprediction, testTarget))  # tracked but not plotted

        dwO, dbO, dwH, dbH = backprop(trainData, x_h, W_o, trainTarget, prediction)

        # Momentum update: blend the previous velocity with the new gradient...
        v_W_o = gamma * v_W_o + alpha * dwO
        v_b_o = gamma * v_b_o + alpha * dbO
        v_W_h = gamma * v_W_h + alpha * dwH
        v_b_h = gamma * v_b_h + alpha * dbH

        # ...then step every parameter against its velocity.
        W_o = W_o - v_W_o
        b_o = b_o - v_b_o
        W_h = W_h - v_W_h
        b_h = b_h - v_b_h

    plot(trainLoss, validLoss, 1) #note: just for plotting purposes
    plot(trainAcc, validAcc, 2)
    return W_o, b_o

def plot(train, valid, fignum):
  """Draw the training and validation curves against epochs in figure `fignum`.

  Figure 1 is labelled as a loss plot and figure 2 as an accuracy plot; any
  other figure number gets only the x-axis label and the legend.

  Returns:
    True, after the figure has been shown.
  """
  plt.figure(fignum)
  train_line, = plt.plot(train)
  valid_line, = plt.plot(valid)
  plt.xlabel('Epochs')

  # (figure number, y-axis label, title) for the two labelled figures.
  labelled_figures = (
    (1, 'Loss', 'Loss vs. Epochs'),
    (2, 'Accuracy', 'Accuracy vs. Epochs'),
  )
  for number, y_label, title in labelled_figures:
    if fignum == number:
      plt.ylabel(y_label)
      plt.title(title)
      break

  plt.legend([train_line, valid_line], ['Training', 'Validation'])
  plt.show()
  return True