In [131]:
import numpy as np
import pandas as pd

In [132]:
class StandardScaler:
    """Z-score normaliser: ``fit`` learns statistics, ``transform`` applies them.

    After ``fit`` the instance carries two attributes:
      * ``u`` -- mean of the fitted data taken along axis 0
      * ``s`` -- standard deviation of the fitted data taken along axis 0
    """

    def __init__(self):
        # Stateless until `fit` is called; `u` and `s` do not exist before that.
        pass

    def fit(self, X_train):
        """Record the axis-0 mean and standard deviation of ``X_train``."""
        self.u = X_train.mean(axis=0)
        self.s = X_train.std(axis=0)

    def transform(self, X):
        """Return ``(X - u) / (s + 1e-6)`` using the statistics stored by ``fit``.

        The same stored statistics are used for whatever ``X`` is passed in,
        so training and test data are scaled consistently.
        """
        # The small constant keeps the division finite when a feature has
        # zero standard deviation (i.e. it was constant in the fitted data).
        denominator = self.s + 1e-6
        centered = X - self.u
        return centered / denominator

In [133]:
class one_hot_encoder:
    """Map class labels to one-hot rows with a deterministic column order.

    ``fit`` stores the distinct labels, sorted, in ``self.label_list``;
    column ``i`` of every encoded matrix corresponds to ``self.label_list[i]``.
    Sorting (rather than relying on ``set`` iteration order) makes the
    encoding reproducible across runs and, for labels ``0..k-1``, guarantees
    that the column index equals the label value.
    """

    def __init__(self):
        # Nothing to configure; `label_list` only exists after `fit`.
        pass

    def fit(self, y_train):
        """Learn the label vocabulary from ``y_train``.

        Parameters
        ----------
        y_train : array-like of labels (any shape; it is flattened).
        """
        # np.unique returns the distinct values in sorted order.
        self.label_list = np.unique(np.asarray(y_train)).tolist()

    def transform(self, y):
        """One-hot encode ``y`` using the vocabulary learned by ``fit``.

        Parameters
        ----------
        y : array-like of labels (flattened before encoding).

        Returns
        -------
        numpy.ndarray of int, shape ``(number of labels in y, len(self.label_list))``
            Each row holds a single 1 in the column of its label.

        Raises
        ------
        ValueError
            If ``y`` contains a label that is not in ``self.label_list``.
        """
        labels = np.asarray(y).reshape(-1)
        # Dictionary lookup is O(1) per sample, unlike list.index.
        column_of = {label: col for col, label in enumerate(self.label_list)}
        try:
            columns = np.array([column_of[label] for label in labels.tolist()], dtype=int)
        except KeyError as err:
            raise ValueError(
                "label %r was not seen during fit" % (err.args[0],)
            ) from None

        ohe_matrix = np.zeros((labels.shape[0], len(self.label_list)), dtype=int)
        # Set one entry per row in a single vectorised assignment.
        ohe_matrix[np.arange(labels.shape[0]), columns] = 1
        return ohe_matrix

In [135]:
def read_dataset(feature_file,label_file):
    """Load a feature CSV and a label CSV into NumPy arrays.

    Both files are parsed with ``pd.read_csv`` using its defaults.

    Returns
    -------
    (X, y)
        ``X`` is the 2-D array of values from ``feature_file``;
        ``y`` is the values from ``label_file`` flattened to 1-D.
    """
    features = pd.read_csv(feature_file).to_numpy()
    # Flatten the label table so callers get a plain vector of labels.
    labels = pd.read_csv(label_file).to_numpy().ravel()
    return features, labels

In [136]:
def normalize_features(X_train,X_test):
    """Standardise both splits with statistics taken from ``X_train`` only.

    A single ``StandardScaler`` is fitted on ``X_train`` and then applied to
    both inputs, so the test split is scaled with the training statistics.

    Returns
    -------
    (X_train_scaled, X_test_scaled)
    """
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled

In [137]:
def ohe(y_train,y_test):
    """One-hot encode both label vectors with a vocabulary learned from ``y_train``.

    A single ``one_hot_encoder`` is fitted on ``y_train`` and then used for
    both inputs, so the two matrices share the same column layout.

    Returns
    -------
    (y_train_encoded, y_test_encoded)
    """
    encoder = one_hot_encoder()
    encoder.fit(y_train)
    y_train_encoded = encoder.transform(y_train)
    y_test_encoded = encoder.transform(y_test)
    return y_train_encoded, y_test_encoded

In [138]:
def softmax(z):
    """Softmax along axis 1 of ``z`` (one probability distribution per row).

    The row maximum is subtracted before exponentiating; this leaves the
    result unchanged mathematically but prevents overflow in ``np.exp``.
    """
    # keepdims=True keeps the reduced axis so the result broadcasts against z.
    row_max = np.amax(z, axis=1, keepdims=True)
    shifted_exp = np.exp(z - row_max)
    row_total = np.sum(shifted_exp, axis=1, keepdims=True)
    return shifted_exp / row_total

In [139]:
def accuracy(ypred, yexact):
    """Fraction of positions where ``ypred`` equals ``yexact``.

    Both inputs are converted to NumPy arrays and flattened first, so plain
    Python lists work and an ``(n,)`` vector can be compared with an ``(n, 1)``
    column without silently broadcasting to an ``(n, n)`` matrix.

    Parameters
    ----------
    ypred : array-like of predicted labels.
    yexact : array-like of reference labels, same number of elements.

    Returns
    -------
    float in ``[0, 1]``.

    Raises
    ------
    ValueError
        If the inputs differ in length or are empty.
    """
    predicted = np.asarray(ypred).reshape(-1)
    expected = np.asarray(yexact).reshape(-1)
    if predicted.shape[0] != expected.shape[0]:
        raise ValueError(
            "ypred and yexact must have the same number of elements "
            "(got %d and %d)" % (predicted.shape[0], expected.shape[0])
        )
    if expected.shape[0] == 0:
        raise ValueError("accuracy is undefined for empty inputs")
    # Mean of a boolean array is the proportion of True entries.
    return np.mean(predicted == expected)

In [140]:
class twolayer_NN:
    """Classifier with two tanh hidden layers and a softmax output layer.

    Architecture: ``X -> tanh(X Wp + bp) -> tanh(f0 W0 + b0) -> softmax(f1 Wq + bq)``.
    Training is full-batch gradient descent on the summed cross-entropy loss:
    one ``feed_forward()`` followed by one ``back_propagation()`` is one step.
    Relies on the module-level ``softmax`` function.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        Training features; ``X.shape[1]`` sets the input layer width.
    y : array, shape (n_samples, n_classes)
        Training targets, expected to be one-hot encoded;
        ``y.shape[1]`` sets the output layer width.
    hidden_layer_nn_0, hidden_layer_nn_1 : int
        Number of units in the first and second hidden layer.
    lr : float
        Gradient-descent step size.
    labels : sequence of length n_classes, optional
        Label for each output column. When given, ``predict`` returns these
        labels; when omitted it returns the winning column index.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` have different numbers of rows, or ``labels`` does
        not have exactly one entry per output column.
    """

    def __init__(self, X, y, hidden_layer_nn_0=100, hidden_layer_nn_1=100, lr=0.01, labels=None):
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y must have the same number of rows (got %d and %d)"
                % (X.shape[0], y.shape[0])
            )
        if labels is not None:
            labels = np.asarray(labels)
            if labels.shape != (y.shape[1],):
                raise ValueError(
                    "labels must be 1-D with one entry per output column (%d)" % y.shape[1]
                )

        self.X = X    # features
        self.y = y    # targets, one-hot encoded
        self.lr = lr  # learning rate
        self.labels = labels
        self.nn = X.shape[1]               # number of neurons in the input layer
        self.output_layer_nn = y.shape[1]  # number of neurons in the output layer

        # Weights are standard-normal draws scaled by 1/sqrt(fan_in); biases
        # start at zero. The draw order (Wp, W0, Wq) is kept fixed so that a
        # given NumPy seed always produces the same initial weights.
        self.Wp = np.random.randn(self.nn, hidden_layer_nn_0) / np.sqrt(self.nn)
        self.bp = np.zeros((1, hidden_layer_nn_0))

        self.W0 = np.random.randn(hidden_layer_nn_0, hidden_layer_nn_1) / np.sqrt(hidden_layer_nn_0)
        self.b0 = np.zeros((1, hidden_layer_nn_1))

        self.Wq = np.random.randn(hidden_layer_nn_1, self.output_layer_nn) / np.sqrt(hidden_layer_nn_1)
        self.bq = np.zeros((1, self.output_layer_nn))

    def _forward(self, X):
        """Run the network on ``X``; return ``(z0, f0, z1, f1, zq, y_hat)``."""
        # first hidden layer: z_0 = X W_p + b_p, f_0 = tanh(z_0)
        z0 = np.dot(X, self.Wp) + self.bp
        f0 = np.tanh(z0)
        # second hidden layer: z_1 = f_0 W_0 + b_0, f_1 = tanh(z_1)
        z1 = np.dot(f0, self.W0) + self.b0
        f1 = np.tanh(z1)
        # output layer: z_q = f_1 W_q + b_q, y_hat = softmax(z_q)
        zq = np.dot(f1, self.Wq) + self.bq
        y_hat = softmax(zq)
        return z0, f0, z1, f1, zq, y_hat

    def feed_forward(self):
        """Forward pass on the training data; caches every intermediate on ``self``.

        Sets ``z0, f0, z1, f1, zq`` and ``y_hat``, which ``back_propagation``
        and ``cross_entropy_loss`` read.
        """
        (self.z0, self.f0, self.z1, self.f1, self.zq, self.y_hat) = self._forward(self.X)

    def back_propagation(self):
        """One gradient-descent update using the values cached by ``feed_forward``.

        Must be called after ``feed_forward``. All gradients are computed
        from the current weights before any weight is modified.
        """
        # Error signals (gradient of the loss w.r.t. each pre-activation):
        #   d_q = y_hat - y                      (softmax + cross-entropy)
        #   d_0 = (1 - f_1^2) * (d_q W_q^T)      (tanh'(z) = 1 - tanh(z)^2)
        #   d_p = (1 - f_0^2) * (d_0 W_0^T)
        dq = self.y_hat - self.y
        d0 = (1-self.f1*self.f1)*(dq.dot((self.Wq).T))
        dp = (1-self.f0*self.f0)*(d0.dot((self.W0).T))

        # Output layer: dL/dW_q = f_1^T d_q, dL/db_q = sum of d_q over samples
        dWq = np.dot(self.f1.T, dq)
        dbq = np.sum(dq, axis=0, keepdims=True)

        # Second hidden layer: dL/dW_0 = f_0^T d_0, dL/db_0 = sum of d_0 over samples
        dW0 = np.dot(self.f0.T, d0)
        db0 = np.sum(d0, axis=0, keepdims=True)

        # First hidden layer: dL/dW_p = X^T d_p, dL/db_p = sum of d_p over samples
        dWp = np.dot((self.X).T, dp)
        dbp = np.sum(dp, axis=0, keepdims=True)

        # Gradient-descent step
        self.Wp = self.Wp - self.lr * dWp
        self.bp = self.bp - self.lr * dbp
        self.W0 = self.W0 - self.lr * dW0
        self.b0 = self.b0 - self.lr * db0
        self.Wq = self.Wq - self.lr * dWq
        self.bq = self.bq - self.lr * dbq

    def cross_entropy_loss(self):
        """Recompute the forward pass and store the summed loss in ``self.loss``.

        L = -sum_n sum_i y[n, i] * log(y_hat[n, i] + 1e-6); the small constant
        keeps the logarithm finite when a probability underflows to zero.
        """
        self.feed_forward()
        self.loss = -np.sum(self.y*np.log(self.y_hat + 1e-6))

    def predict(self, X_test):
        """Predict a class for every row of ``X_test``.

        Returns
        -------
        numpy.ndarray, shape (n_samples,)
            Index of the most probable output column for each row, or the
            matching entry of ``labels`` when labels were given at construction.
        """
        y_hat_test = self._forward(X_test)[-1]
        # Most probable class per row, for any number of output columns.
        class_index = np.argmax(y_hat_test, axis=1).astype(int)
        labels = getattr(self, 'labels', None)
        if labels is None:
            return class_index
        return labels[class_index]

In [141]:
# Load the MNIST train and test splits; each split has one CSV of features
# and one CSV of labels, read relative to the notebook's working directory.
X_train, y_train = read_dataset(
    feature_file='MNIST_X_train.csv',
    label_file='MNIST_y_train.csv',
)
X_test, y_test = read_dataset(
    feature_file='MNIST_X_test.csv',
    label_file='MNIST_y_test.csv',
)

In [148]:

# Seed NumPy's global generator so the random weight initialisation inside
# twolayer_NN (np.random.randn) is the same on every Restart & Run All.
SEED = 0
np.random.seed(SEED)

# Scale features with training-set statistics and one-hot encode the labels.
X_train_norm, X_test_norm = normalize_features(X_train, X_test)
y_train_ohe, y_test_ohe = ohe(y_train, y_test)

myNN = twolayer_NN(X_train_norm, y_train_ohe, hidden_layer_nn_0=300, hidden_layer_nn_1=100, lr=0.001)
epoch_num = 200
LOG_EVERY = 20  # report the training loss every LOG_EVERY epochs
for i in range(epoch_num):
    myNN.feed_forward()
    myNN.back_propagation()
    if ((i+1) % LOG_EVERY == 0):
        # cross_entropy_loss() runs its own forward pass, so it is only called
        # on epochs whose loss is printed; the printed values are unaffected.
        myNN.cross_entropy_loss()
        print('epoch = %d, current loss = %.5f' % (i+1, myNN.loss))

# Evaluate on the held-out split.
y_pred = myNN.predict(X_test_norm)
print('Accuracy of our model ', accuracy(y_pred, y_test.ravel()))

epoch = 20, current loss = 162.89495
epoch = 40, current loss = 48.29837
epoch = 60, current loss = 22.92000
epoch = 80, current loss = 14.14362
epoch = 100, current loss = 9.99184
epoch = 120, current loss = 7.62937
epoch = 140, current loss = 6.12242
epoch = 160, current loss = 5.08525
epoch = 180, current loss = 4.33173
epoch = 200, current loss = 3.76168
Accuracy of our model  0.896
