# 1 Neural Networks using Numpy [14 pts.]

## 1.1 Helper Functions [4 pts]

1. ReLU(): This function will accept one argument and return Numpy array with the ReLU
activation and the equation is given below. [0.5 pt]
ReLU(x) = max(x, 0)
2. softmax(): This function will accept one argument and return a Numpy array with the
softmax activations of each of the inputs and the equation is shown below. [0.5 pt]
3. compute(): This function will accept 3 arguments: a weight, an input, and a bias matrix
and return the product between the weights and input, plus the biases (i.e. a prediction for
a given layer). [0.5 pt]



In [1]:
import numpy as np
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import time
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Load the data
def loadData():
    with np.load("notMNIST.npz") as data:
        Data, Target = data["images"], data["labels"]
        np.random.seed(521)
        randIndx = np.arange(len(Data))
        np.random.shuffle(randIndx)
        Data = Data[randIndx] / 255.0
        Target = Target[randIndx]
        trainData, trainTarget = Data[:10000], Target[:10000]
        validData, validTarget = Data[10000:16000], Target[10000:16000]
        testData, testTarget = Data[16000:], Target[16000:]
    return trainData, validData, testData, trainTarget, validTarget, testTarget

# Implementation of a neural network using only Numpy - trained using gradient descent with momentum
def convertOneHot(trainTarget, validTarget, testTarget):
    newtrain = np.zeros((trainTarget.shape[0], 10))
    newvalid = np.zeros((validTarget.shape[0], 10))
    newtest = np.zeros((testTarget.shape[0], 10))

    for item in range(0, trainTarget.shape[0]):
        newtrain[item][trainTarget[item]] = 1
    for item in range(0, validTarget.shape[0]):
        newvalid[item][validTarget[item]] = 1
    for item in range(0, testTarget.shape[0]):
        newtest[item][testTarget[item]] = 1
    return newtrain, newvalid, newtest


def shuffle(trainData, trainTarget):
    np.random.seed(421)
    randIndx = np.arange(len(trainData))
    target = trainTarget
    np.random.shuffle(randIndx)
    data, target = trainData[randIndx], target[randIndx]
    return data, target

  from ._conv import register_converters as _register_converters


In [23]:
trainData, validData, testData, trainTarget, validTarget, testTarget = loadData()
trainTarget, validTarget, testTarget = convertOneHot(trainTarget, validTarget, testTarget)
print(trainData.shape)
print(trainTarget[1])

(10000, 28, 28)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]


In [2]:
def ReLu(x):
    x[x<0] = 0
    return np.array(x)

In [61]:
def gradReLu(x):
    x[x>0] = 1
    x[x<=0] = 0
    return x

In [145]:
def softmax(x):
    x = np.exp(x)/np.sum(np.exp(x))
    return x

In [146]:
def compute(X, W, b):
    return np.matmul(X, W) + b

In [147]:
def averageCE(target, prediction):
    prediction = softmax(prediction)
    total = np.sum(target * np.log(prediction))
    avg = total * (-1/len(target))
    return avg

In [148]:
def gradCE(target, prediction):
    total = np.sum(target * np.divide(1,softmax(prediction)))
    gradCE = total * (-1/len(target))
    return gradCE

In [158]:
def gradSoftmax(s):
    s = softmax(s).reshape(-1, 1)
    print("in softmax", s.shape)
    return np.diagflat(s) - np.dot(s, np.transpose(s))

## 1.2 Backpropagation Derivation [4 pts.]
To train the neural network, you will need to implement the backpropagation algorithm. For the
neural network architecture outlined in the assignment description, derive the following analytical
expressions and include them in your report:

In [159]:
class myNN():
    def __init__(self, num_hidden):
        super(myNN, self).__init__()
        
        n_x = 784 # size of input layer`
        k = num_hidden
        n_y = 10 # size of output layer
        
        #initialize parameters
        self.W1 = np.random.randn(n_x, k) * (2 / (n_x + k))**(1/2.0) #Wh
        self.b1 = np.zeros(shape=(1,k))
        self.W2 = np.random.randn(k,n_y) * (2 / (k + n_y))**(1/2.0) #Wo
        self.b2 = np.zeros(shape=(1,n_y))
        
        #initialize hidden units and signals
        self.X0 = np.zeros(shape=(1,n_x))
        self.X1 = np.zeros(shape=(1,k))
        self.X2 = np.zeros(shape=(1,n_y))
        self.S1 = np.zeros(shape=(1,k))
        self.S2 = np.zeros(shape=(1,n_y))
        
        #for back propagation
        self.V2_w = np.full((n_x, k), 1e-5)
        self.V1_w = np.full((k, n_y), 1e-5)
        self.V2_b = np.full((1, n_y), 1e-5)
        self.V1_b = np.full((1, k), 1e-5)
        self.gamma = 0.9
        self.alpha = 1 - self.gamma
        
    def forward(self, img):
        self.X0 = img
        # Implement Forward Propagation to calculate A2 (probabilities)
#         S1 = np.dot(W1,X0) + b1 # replace with compute
        self.S1 = compute(self.X0, self.W1, self.b1)
#         X1 = np.tanh(S1) #replace with relu
        self.X1 = ReLu(self.S1)
#         S2 = np.dot(W2,X1) + b2 # replace with compute
        self.S2 = compute(self.X1, self.W2, self.b2)
#         X2 = sigmoid(S2) # replace with SOFTMAX
        self.X2 = softmax(self.S2)
    
        return self.X2
    
    def backward(self, target):
        print("delta2:", target.shape, self.X2.shape, self.S2.shape)
        print(gradCE(target, self.X2).shape, gradSoftmax(self.S2).shape)
        delta2 = np.multiply(gradCE(target, self.X2),gradSoftmax(self.S2))
        print(delta2.shape, self.S1.shape, self.W2.shape)
        delta1 = np.multiply(np.matmul(self.W2,delta2), gradReLu(self.S1))
        
        d_W2 = np.matmul(self.X1.T,delta2) 
        d_b2 = delta2
        d_W1 = np.matmul(self.X0.T,delta1)
        d_b1 = delta1
        
        self.V2_w = self.gamma * self.V2_w + self.alpha * d_W2
        self.V2_b = self.gamma * self.V2_b + self.alpha * d_b2
        
        self.V1_w = self.gamma * self.V1_w + self.alpha * d_W1
        self.V1_b = self.gamma * self.V1_b + self.alpha * d_b1
        return 
    def update(self):
        self.W1 = self.W1 = self.V1_w
        self.b1 = self.b1 = self.V1_b
        self.W2 = self.W2 = self.V2_w
        self.b2 = self.b2 = self.V2_b
        return

In [160]:
def get_accuracy(model, data, target):
    correct = 0
    total = 0
    for image, label in zip(data, target):
        output = model(image)  
        pred = np.argmax(output)
        correct += pred.eq(label.view_as(pred)).sum().item()       
        total += img.shape[0]
    return correct / total

In [161]:
def train(model,trainData, validData, testData, trainTarget, validTarget, testTarget, num_epochs = 10):
    # initialize all the weights
    losses, valid_losses, train_acc, valid_acc = [], [], [], []
    epochs = []
    
    assert trainData.shape[0] == 10000
    
    for epoch in range(num_epochs):
        train_loss = 0
        for image, label in zip(trainData, trainTarget):
            label = np.reshape(label, (1, 10))
            print("sup")
            image = np.reshape(image, (1, 784))
            
            
            pred = model.forward(image)
            print(pred.shape)
            print(label.shape)
            loss = averageCE(label, pred)
            train_loss += loss/(trainData.shape[0])
            model.backward(label)
            
            model.update()
    
    
        valid_loss = 0
        for image, label in zip(validData, validTarget):
            image = np.reshape(image, (1, 784))
            label = label.T
            
            pred = model(image)
            valid_loss += averageCE(label, pred)/(validData.shape[0])
            
        valid_losses.append(float(valid_loss)) 
            
        losses.append(float(train_loss))

        epochs.append(epoch)
        train_acc.append(get_accuracy(model, trainData,trainTarget ))
        valid_acc.append(get_accuracy(model, validData, validTarget))
        print("Epoch %d; Loss %f; Train Acc %f; Val Acc %f" % (
              epoch+1, loss, train_acc[-1], valid_acc[-1]))
        
     # plotting
    plt.title("Training Curve")
    plt.plot(epochs, losses, label="Train")
    plt.plot(epochs, valid_losses, label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()

    plt.title("Training Curve")
    plt.plot(epochs, train_acc, label="Train")
    plt.plot(epochs, valid_acc, label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend(loc='best')
    plt.show()

In [162]:
model1 = myNN(1000)
train(model1,trainData, validData, testData, trainTarget, validTarget, testTarget, num_epochs = 10)


sup
(1, 10)
(1, 10)
delta2: (1, 10) (1, 10) (1, 10)
in softmax (10, 1)
() (10, 10)
in softmax (10, 1)
(10, 10) (1, 1000) (1000, 10)


ValueError: operands could not be broadcast together with shapes (1000,10) (1,1000) 