# In [72]:
#Written by Jacob Clarke
#Homework 4
import math
import pandas as pd
import numpy as np
import copy
import random
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler

class ANN():
    """Fully connected feed-forward network trained with back-propagation.

    The network uses the sigmoid activation everywhere and is trained by
    stochastic gradient descent (one weight update per example), following
    the Russell and Norvig BACK-PROP-LEARNING pseudo-code.

    Bias handling: a constant-1 column is prepended to every input example,
    and unit 0 of every hidden layer is a bias unit whose activation is
    pinned to 1 (this is the extra unit added in __init__).

    Output node j stands for the class label with value j, so labels are
    expected to be the integers 0..k-1.
    """
    num_hidden_layers = 1
    num_units_per_hidden_layer = 4
    network = None # initialized in __init_network

    def __init__(self, num_hidden_layers = 1,
                 num_units_per_hidden_layer = 4):
        """Store the topology; the network itself is built by fit."""
        self.num_hidden_layers = num_hidden_layers
        # Add one for the bias
        self.num_units_per_hidden_layer = num_units_per_hidden_layer + 1
    # end __init__

    def fit(self, X, y, alpha, t):
        """Train the network on the examples in X with targets y.

        X is an (m, n) DataFrame or array of numeric features.
        y holds the m class labels (Series, list or array).
        alpha is the learning rate.
        t is the number of passes over the whole training set.

        Raises ValueError when X is not 2-D or X and y differ in length.
        """
        inputs = self.__with_bias(X)
        # Working on plain arrays avoids any dependence on the pandas index
        # of X and y lining up.
        targets = np.asarray(y).ravel()
        if len(inputs) != len(targets):
            raise ValueError(
                "X and y must contain the same number of examples")

        self.__back_prop_learning(inputs, targets, alpha, t)
    # end fit

    def predict(self, T):
        """Return an (m, k) array with the output activation of every class.

        T is an (m, n) DataFrame or array with the same n features that
        were passed to fit. Row i of the result belongs to the i-th row of
        T by position, and column j is the activation of output node j.

        Raises RuntimeError when called before fit and ValueError when the
        number of features does not match the trained network.
        """
        if self.network is None:
            raise RuntimeError("fit must be called before predict")

        # The same bias column that fit adds must be present here.
        inputs = self.__with_bias(T)
        expected = len(self.network.layers[0].activations)
        if inputs.shape[1] != expected:
            raise ValueError(
                "T has %d features but the network was trained with %d"
                % (inputs.shape[1] - 1, expected - 1))

        num_outputs = len(self.network.layers[-1].activations)
        prediction_array = np.zeros(shape=(len(inputs), num_outputs))
        for position, example in enumerate(inputs):
            prediction_array[position] = self.__forward(example)
        return prediction_array
    # end predict

    def print(self):
        """Print every weight from the input layer to the output layer."""
        for currentLayer in range(self.num_hidden_layers + 1):
            layer_weights = self.network.weights[currentLayer]
            for node_i in range(len(self.network.layers[currentLayer].activations)):
                for node_j in range(len(self.network.layers[currentLayer + 1].activations)):
                    #prints the information about the network
                    print('Weight of Layer: ', currentLayer, 'Node: ',node_i," to Layer: ", currentLayer+1, 'Node: ',node_j, 'is: ',layer_weights[node_i][node_j])
    # end print

    ## Private helper methods of the ANN class ##
    def __with_bias(self, X):
        """Return X as a 2-D float array with a leading column of ones."""
        features = np.asarray(X, dtype=float)
        if features.ndim != 2:
            raise ValueError("expected a 2-D (examples, features) input")
        bias = np.ones((features.shape[0], 1))
        return np.hstack([bias, features])
    # end __with_bias

    def __forward(self, example):
        """Propagate one example (bias included) and return the outputs.

        The activations of every layer are stored in the network so that
        back-propagation can read them afterwards.
        """
        layers = self.network.layers
        weights = self.network.weights
        output_index = len(layers) - 1

        layers[0].activations[:] = example
        for current in range(1, len(layers)):
            layers[current].activations[:] = self.__activation_fn(
                weights[current - 1], layers[current - 1].activations)
            if current != output_index:
                # Unit 0 of a hidden layer is the bias unit: always 1.
                layers[current].activations[0] = 1.0
        return layers[output_index].activations
    # end __forward

    def __back_prop_learning(self, X, y, alpha, t):
        """Run t passes of per-example back-propagation over (X, y).

        X is the (m, n + 1) float array that already contains the bias
        column and y is the matching 1-D array of class labels.
        """
        # Initialize starting weights to small random numbers according
        # to Xavier init. Link to Xavier paper:
        # http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf
        # Random starting weights keep the units of a layer from receiving
        # identical updates and staying identical forever.
        self.__init_network(X, y)

        layers = self.network.layers
        weights = self.network.weights
        output_index = len(layers) - 1
        num_outputs = len(layers[output_index].activations)

        # Row i is the one-hot target of example i: entry j is 1 exactly
        # when the label equals j.
        one_hot = (y[:, None] == np.arange(num_outputs)).astype(float)

        for _ in range(t):
            for example, target in zip(X, one_hot):
                outputs = self.__forward(example)

                # Output layer: delta_j = g'(in_j) * (y_j - a_j), where
                # g' = a * (1 - a) for the sigmoid.
                layers[output_index].deltas[:] = (
                    outputs * (1 - outputs) * (target - outputs))

                # Hidden layers, last to first:
                # delta_i = g'(in_i) * sum_j w_ij * delta_j.
                # A bias unit has a = 1, so its delta is 0 automatically.
                for current in range(output_index - 1, 0, -1):
                    activations = layers[current].activations
                    layers[current].deltas[:] = (
                        activations * (1 - activations)
                        * np.dot(weights[current], layers[current + 1].deltas))

                # All deltas are known; now w_ij += alpha * a_i * delta_j.
                for current in range(output_index):
                    weights[current] += alpha * np.outer(
                        layers[current].activations,
                        layers[current + 1].deltas)
            # end for each example
        # end for each iteration
    # end __back_prop_learning

    def __init_network(self, X, y):
        """Build the Network and give it Xavier-initialized float weights."""
        num_features = X.shape[1]
        self.num_classes = len(np.unique(y))
        self.network = Network(num_features, self.num_hidden_layers,
                               self.num_units_per_hidden_layer,
                               self.num_classes)

        # Initialize the weights based on the Xavier initialization method
        # for all layers
        for u in range(len(self.network.weights)):
            num_in, num_out = self.network.weights[u].shape
            xavier = math.sqrt(6 / (num_in + num_out))

            # Store a float matrix (not an object matrix) so the forward
            # and backward passes can use vectorized numpy arithmetic.
            drawn = [[random.uniform(-xavier, xavier)
                      for _ in range(num_out)]
                     for _ in range(num_in)]
            self.network.weights[u] = np.array(
                drawn, dtype=float).reshape(num_in, num_out)
    # end __init_network

    def __activation_fn(self, weights, activations):
        """Return sigmoid(activations . weights).

        weights is either the (inputs, outputs) weight matrix between two
        layers, giving one value per output unit, or a single 1-D weight
        vector, giving a scalar.
        """
        weighted_sum = np.dot(activations, weights)
        # Clipping keeps np.exp from overflowing; the sigmoid is already
        # saturated at 0 or 1 far inside this range.
        weighted_sum = np.clip(weighted_sum, -500, 500)
        return 1 / (1 + np.exp(-weighted_sum))
    # end __activation_fn
# end ANN

class Network():
    """Container for the layers of a network and the weights between them.

    layers is an object array of num_hidden_layers + 2 Layer instances
    (input, hidden..., output). weights is an object array with one
    zero-filled (units in layer i, units in layer i + 1) matrix per pair of
    consecutive layers.
    """
    layers = None
    weights = None

    def __init__(self, num_features, num_hidden_layers,
                 num_units_per_hidden_layer, num_classes):
        self.layers = np.empty(num_hidden_layers + 2, dtype=object)
        self.weights = np.empty(num_hidden_layers + 1, dtype=object)

        output_position = len(self.layers) - 1
        for position in range(len(self.layers)):
            # First layer takes the features, last one the classes and
            # everything in between is a hidden layer.
            if position == 0:
                unit_count = num_features
            elif position == output_position:
                unit_count = num_classes
            else:
                unit_count = num_units_per_hidden_layer
            self.layers[position] = Layer(unit_count)

        # One weight matrix per pair of neighbouring layers.
        neighbours = zip(self.layers[:-1], self.layers[1:])
        for position, (source, destination) in enumerate(neighbours):
            matrix_shape = (len(source.activations),
                            len(destination.activations))
            self.weights[position] = np.zeros(
                shape=matrix_shape, dtype=object)
    # end __init__
# end Network

class Layer():
    """One layer of the network: an activation and a delta per unit."""
    activations = None
    deltas = None

    def __init__(self, units):
        # Two independent arrays of ones, one entry per unit.
        self.activations, self.deltas = (np.ones(units) for _ in range(2))
    # end __init__
# end Layer

def accuracy(ann, data_set):
    """Return the fraction of examples whose predicted class is correct.

    ann is the (m, k) array of class probabilities returned by
    ANN.predict (despite the parameter name, it is not the ANN object).
    Row i must belong to the i-th row of data_set.

    data_set is a DataFrame holding the true class values. They are read
    from the 'label' column when there is one and from the first column
    otherwise.

    The predicted class of an example is the column with the highest
    probability (the first one on ties). Returns 0 for an empty data_set.
    Raises ValueError when ann is not 2-D or has fewer rows than data_set.
    """
    #length is the number of examples in the data_set
    length = len(data_set)
    if length == 0:
        return 0

    probabilities = np.asarray(ann, dtype=float)
    if probabilities.ndim != 2 or len(probabilities) < length:
        raise ValueError(
            "ann must be an (m, k) array with a row for every example")

    #the prediction is the digit with the highest probability
    predicted = np.argmax(probabilities[:length], axis=1)

    # Looking the column up by name keeps this correct even when another
    # column (e.g. one added by reset_index) sits in front of it.
    if 'label' in data_set.columns:
        actual = data_set['label']
    else:
        actual = data_set.iloc[:, 0]

    #share of examples where the prediction equals the true class value
    return float(np.mean(predicted == np.asarray(actual)))
# end accuracy

def _three_fold_splits(df):
    """Return three (training_set, validation_set) pairs built from df.

    df is cut into three consecutive partitions of (almost) equal size.
    Every partition is the validation set of one fold while the other two
    form the training set. All returned frames get a fresh 0..m-1 index
    and keep the columns of df unchanged.
    """
    lengthData = len(df)

    #length of the first two partitions; the third takes what is left
    partition1Length = int(lengthData / 3)
    partition2Length = int((lengthData - partition1Length) / 2)

    #where to separate each partition
    part1 = partition1Length
    part2 = partition1Length + partition2Length

    partition1Frame = df.iloc[0:part1]
    partition2Frame = df.iloc[part1:part2]
    partition3Frame = df.iloc[part2:lengthData]

    pairs = [
        ([partition1Frame, partition2Frame], partition3Frame),
        ([partition2Frame, partition3Frame], partition1Frame),
        ([partition1Frame, partition3Frame], partition2Frame),
    ]

    # drop=True discards the old (shuffled) index instead of turning it
    # into an extra column, so 'label' stays first and the old index
    # never becomes a feature.
    return [(pd.concat(training_parts).reset_index(drop=True),
             validation_part.reset_index(drop=True))
            for training_parts, validation_part in pairs]
# end _three_fold_splits

def validation_curve():
    """Plot average accuracy against the size of the single hidden layer.

    Reads 'train.csv', scales the features to [0, 1], shuffles the rows
    and keeps 2000 of them. For 2 to 20 hidden units in steps of 2 it runs
    3-fold cross-validation with a one-hidden-layer ANN, averages the
    training and validation accuracy over the folds, saves the curve to
    'Clarke_homework4.pdf' and shows it.
    """
    # Split your data set into y and X
    df = pd.read_csv('train.csv')
    #y will hold the label/class values in a series
    y = pd.Series(df['label'])
    #X is the dataFrame without the class/label column
    X = df.drop(['label'], axis = 1)
    #cols holds the attribute/feature names
    cols = X.columns

    # Normalize X using sklearn MinMaxScaler
    scaler = MinMaxScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns = cols)
    # Put y and the scaled X back together (label first) so that the
    # partitions keep every example with its class value
    df = pd.concat([y, X], axis = 1)
    #shuffle the data frame
    df = df.sample(frac=1)

    #shrink the df
    df = df[:2000]

    folds = _three_fold_splits(df)

    #number of units in the only hidden layer for every step of the curve
    numHiddenUnits = np.arange(2, 21, 2)

    #one row per fold, one column per step
    trainingAccuracy = np.zeros((len(folds), len(numHiddenUnits)))
    validationAccuracy = np.zeros((len(folds), len(numHiddenUnits)))

    for fold, (training_set, validation_set) in enumerate(folds):
        training_y = pd.Series(training_set['label'])
        training_setNoClass = training_set.drop(['label'], axis = 1)
        validation_setNoClass = validation_set.drop(['label'], axis = 1)

        for step, units in enumerate(numHiddenUnits):
            # A fresh network per fold and step: exactly one hidden
            # layer, with the step's number of units.
            ann = ANN(1, int(units))
            ann.fit(training_setNoClass, training_y, .2, 10)

            trainingAccuracy[fold, step] = accuracy(
                ann.predict(training_setNoClass), training_set)
            validationAccuracy[fold, step] = accuracy(
                ann.predict(validation_setNoClass), validation_set)

    #average scores over the three folds
    AvgAccTs = trainingAccuracy.mean(axis=0)
    AvgAccVs = validationAccuracy.mean(axis=0)

    fig = plt.figure()
    plt.plot(numHiddenUnits, AvgAccTs) #plot the graph
    plt.plot(numHiddenUnits, AvgAccVs)
    plt.title("Validation Curve (3-Fold Cross-Validation)") #add a title
    plt.xlabel("Number of Units in Hidden Layer") #label x axis
    plt.ylabel("Average Accuracy") #label y axis

    plt.legend(['Average Training Set', 'Average Validation Set'], loc='upper left')
    # Save before showing: once show() returns the figure may already
    # have been closed, which would leave an empty file.
    fig.savefig("Clarke_homework4.pdf",bbox_inches='tight')
    plt.show()
# end validation_curve
# Run the cross-validation experiment and draw the validation curve.
validation_curve()



# Notebook output: KeyboardInterrupt (the run of the cell above was interrupted).