# COMS 4995_002 Deep Learning Assignment 1
Due on Monday, Oct 9, 11:59pm

This assignment can be done in groups of at most 3 students. Everyone must submit on Courseworks individually.

Write down the UNIs of your group (if applicable)

Member 1: Kaho Chan, kc3137

Member 2: Yu Wang, yw3025

Member 3: Jingxi Xu, jx2324

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import glob
import sys
# you shouldn't need to make any more imports

In [3]:
class NeuralNetwork(object):
    """
    Abstraction of neural network.
    Stores parameters, activations, cached values. 
    Provides necessary functions for training and prediction. 
    """
    def __init__(self, layer_dimensions, drop_prob=0.0, reg_lambda=0.0):
        """
        Initializes the weights and biases for each layer
        :param layer_dimensions: (list) number of nodes in each layer
        :param drop_prob: drop probability for dropout layers. Only required in part 2 of the assignment
        :param reg_lambda: regularization parameter. Only required in part 2 of the assignment
        """
        np.random.seed(1)
        
        self.parameters = {}
        self.num_layers = len(layer_dimensions)
        self.drop_prob = drop_prob
        self.reg_lambda = reg_lambda
        self.status = 'train'
        self.X_val = None
        self.y_val = None
        
        # init parameters
        for i in range(1, self.num_layers):
#             self.parameters[('W', i)] = np.random.normal(0, 1, (layer_dimensions[i], layer_dimensions[i - 1]))
#             self.parameters[('W', i)] /= np.sqrt(layer_dimensions[i - 1])
            self.parameters[('W', i)] = np.random.randn(layer_dimensions[i], layer_dimensions[i - 1]) / np.sqrt(layer_dimensions[i - 1])
            self.parameters[('b', i)] = np.zeros([layer_dimensions[i], 1])

    def affineForward(self, A, W, b):
        """
        Forward pass for the affine layer.
        :param A: input matrix, shape (L, S), where L is the number of hidden units in the previous layer and S is
        the number of samples
        :returns: the affine product WA + b, along with the cache required for the backward pass
        """
#         M = None
#         if self.status == 'train' and self.drop_prob > 0:
#             # dropout only when training
#             A, M = self.dropout(A, self.drop_prob)
        Z = np.dot(W, A) + b
        cache = (A, W, b, Z)
        return Z, cache

    def activationForward(self, A, activation="relu"):
        """
        Common interface to access all activation functions.
        :param A: input to the activation function
        :param prob: activation funciton to apply to A. Just "relu" for this assignment.
        :returns: activation(A)
        """ 
        return self.relu(A)

    def relu(self, X):
        A = np.maximum(0, X)
        assert (X.shape == A.shape)
        return A
            
    def dropout(self, A, prob):
        """
        :param A: 
        :param prob: drop prob
        :returns: tuple (A, M) 
            WHERE
            A is matrix after applying dropout
            M is dropout mask, used in the backward pass
        """
        M = np.random.uniform(size = A.shape)
        M = (M > prob) / (1 - prob)
        A = A * M
        return A, M

    def forwardPropagation(self, X):
        """
        Runs an input X through the neural network to compute activations
        for all layers. Returns the output computed at the last layer along
        with the cache required for backpropagation.
        :returns: (tuple) AL, cache
            WHERE 
            AL is activation of last layer
            cache is cached values for each layer that
                     are needed in further steps
        """
        
        cache = {}
        A = X
        for l in range(1, self.num_layers):
            A, cache_l = self.affineForward(A,
                                            self.parameters[('W', l)], 
                                            self.parameters[('b', l)])
            
            if (l != self.num_layers - 1):
                # don't do relu on output layer
                A = self.activationForward(A)
            M = None
            if self.status == 'train' and self.drop_prob > 0 and l != self.num_layers - 1:
            # dropout only when training
                A, M = self.dropout(A, self.drop_prob)
            cache[l] = cache_l + (M,)
#             print (len(cache[l]))
        # return AL, cache
        return A, cache
    
    def costFunction(self, AL, y):
        """
        :param AL: Activation of last layer, shape (num_classes, S)
        :param y: labels, shape (S)
        :param alpha: regularization parameter
        :returns cost, dAL: A scalar denoting cost and the gradient of cost
        """
        assert (self.status == 'train')
        # compute loss
        S = AL.shape[1]
        probs = np.exp(AL - np.max(AL, axis=0, keepdims=True))
        probs /= np.sum(probs, axis=0, keepdims=True)
        cost = -np.sum(np.log(probs[y, np.arange(S)])) / S
        
        if self.reg_lambda > 0:
            # add regularization
            for l in range(1, self.num_layers):
                cost += 0.5 * self.reg_lambda * np.sum(np.power(self.parameters[('W', l)], 2))
        dAL = probs.copy()
        dAL[y, np.arange(S)] -= 1
        dAL /= S
        return cost, dAL

    def affineBackward(self, dA_prev, cache):
        """
        Backward pass for the affine layer.
        :param dA_prev: gradient from the next layer.
        :param cache: cache returned in affineForward
        :returns dA: gradient on the input to this layer
                 dW: gradient on the weights
                 db: gradient on the bias
        """
#        A = cache[0]
#        W = cache[1]
#        b = cache[2]
        A, W, b, Z, M = cache
        S = A.shape[1]
        
#         dZ = self.activationBackward(dA_prev, cache)
        dA = np.dot(W.T, dA_prev)
        dW = np.dot(dA_prev, A.T) / S
        db = np.sum(dA_prev, axis = 1, keepdims=True) / S # or np.mean() ?
        # dA = W.T.dot(dA_prev) 
        # dW = dA_prev.dot(A.T) / S
        # db = np.sum(dA_pre, axis = 1) / S
        return dA, dW, db
    
    def activationBackward(self, dA, cache, activation="relu"):
        """
        Interface to call backward on activation functions.
        In this case, it's just relu. 
        """
        return self.relu_derivative(dA, cache[3]) # cache[3] == Z[l]
        
    def relu_derivative(self, dx, cached_x):
        out = np.maximum(0, cached_x)
        out[out > 0] = 1
        dx = out * dx
        return dx

    def dropout_backward(self, dA, cache):
        A, W, b, Z, M = cache
        return dA * M # cache[4] = Mdropout_backward

    def backPropagation(self, dAL, Y, cache):
        """
        Run backpropagation to compute gradients on all paramters in the model
        :param dAL: gradient on the last layer of the network. Returned by the cost function.
        :param Y: labels
        :param cache: cached values during forwardprop
        :returns gradients: dW and db for each weight/bias
        """
        assert (self.status == 'train')
        gradients = {}
        dA = dAL
        for l in range(self.num_layers-1, 0, -1):
            if self.drop_prob > 0 and l != self.num_layers - 1:
                dA = self.dropout_backward(dA, cache[l])
            if l != self.num_layers - 1:
                dA = self.activationBackward(dA, cache[l])
            dA, dW, db = self.affineBackward(dA, cache[l])
            # assert (dW.shape == self.parameters[('W', l)].shape), '{} - {}'.format(dW.shape, self.parameters[('W', l)].shape)
            # assert (db.shape == self.parameters[('b', l)].shape), '{} - {}'.format(db.shape, self.parameters[('b', l)].shape)
            gradients[('W', l)] = dW
            # gradients[('b', l)] = db # should b be regularized ?
            if self.reg_lambda > 0:
                # add gradients from L2 regularization to each dW
                gradients[('W', l)] += self.reg_lambda * self.parameters[('W', l)]
               #  gradients[('b', l)] += reg_lambda * self.parameters[('b', l)]
        return gradients

    def updateParameters(self, gradients, alpha):
        """
        :param gradients: gradients for each weight/bias
        :param alpha: step size for gradient descent 
        """
        for key in gradients.keys():
            self.parameters[key] -= alpha * gradients[key]
            
    def train(self, X, y, iters=1000, alpha=0.0001, batch_size=100, print_every=100):
        """
        :param X: input samples, each column is a sample
        :param y: labels for input samples, y.shape[0] must equal X.shape[1]
        :param iters: number of training iterations
        :param alpha: step size for gradient descent
        :param batch_size: number of samples in a minibatch
        :param print_every: no. of iterations to print debug info after
        """
        assert (alpha * self.reg_lambda < 1)
        self.status = 'train'
        self.parameters['mean'] = np.mean(X, axis = 1, keepdims = True)
        self.parameters['var'] = np.var(X, axis = 1, keepdims = True)
        X = (X - self.parameters['mean']) / np.sqrt(self.parameters['var'])
        for i in range(0, iters):
            # get minibatch
            X_batch, y_batch = self.get_batch(X, y, batch_size)
            # forward prop
            AL, cache = self.forwardPropagation(X_batch)
            # compute loss
            cost, dAL = self.costFunction(AL, y_batch)
            # compute gradients
            gradients = self.backPropagation(dAL, y_batch, cache)
            # update weights and biases based on gradient
            self.updateParameters(gradients, alpha)
            if i % print_every == 0:
                # print cost, train and validation set accuracies
                trn_acc = self.score(self.predict(X), y)
                if self.X_val is not None:
                    val_acc = self.score(self.predict(self.X_val), self.y_val)
                else:
                    val_acc = np.nan
                print('iter={:5}, cost={:.4f}, trn_acc={:.4f}, val_acc={:.4f}'.format(i, cost, trn_acc, val_acc))
                
    def predict(self, X):
        """
        Make predictions for each sample
        """
        self.status = 'predict'
        X = (X - self.parameters['mean']) / np.sqrt(self.parameters['var'])
        AL, _ = self.forwardPropagation(X)
        y_pred = np.argmax(AL, axis = 0)
        self.status = 'train'
        return y_pred

    def get_batch(self, X, y, batch_size):
        """
        Return minibatch of samples and labels
        
        :param X, y: samples and corresponding labels
        :parma batch_size: minibatch size
        :returns: (tuple) X_batch, y_batch
        """
        batch_idx = np.random.randint(X.shape[1], size = batch_size)
        X_batch = X[:, batch_idx]
        y_batch = y[batch_idx]
#         print y_batch.shape
        return X_batch, y_batch
    
    def score(self, y_pred, y_gold):
        return np.mean(y_pred == y_gold)
    
    def load_validation_set(self, X_val, y_val):
        self.X_val = X_val
        self.y_val = y_val

In [4]:
# Helper functions, DO NOT modify this
def get_img_array(path):
    """
    Given path of image, returns it's numpy array
    """
    return scipy.misc.imread(path)

def get_files(folder):
    """
    Given path to folder, returns list of files in it
    """
    filenames = [file for file in glob.glob(folder+'*/*')]
    filenames.sort()
    return filenames

def get_label(filepath, label2id):
    """
    Files are assumed to be labeled as: /path/to/file/999_frog.png
    Returns label for a filepath
    """
    tokens = filepath.split('/')
    label = tokens[-1].split('_')[1][:-4]
    if label in label2id:
        return label2id[label]
    else:
        sys.exit("Invalid label: " + label)

In [5]:
# Functions to load data, DO NOT change these

def get_labels(folder, label2id):
    """
    Returns vector of labels extracted from filenames of all files in folder
    :param folder: path to data folder
    :param label2id: mapping of text labels to numeric ids. (Eg: automobile -> 0)
    """
    files = get_files(folder)
    y = []
    for f in files:
        y.append(get_label(f,label2id))
    return np.array(y)

def one_hot(y, num_classes=10):
    """
    Converts each label index in y to vector with one_hot encoding
    """
    y_one_hot = np.zeros((y.shape[0], num_classes))
    y_one_hot[y] = 1
    return y_one_hot.T

def get_label_mapping(label_file):
    """
    Returns mappings of label to index and index to label
    The input file has list of labels, each on a separate line.
    """
    with open(label_file, 'r') as f:
        id2label = f.readlines()
        id2label = [l.strip() for l in id2label]
    label2id = {}
    count = 0
    for label in id2label:
        label2id[label] = count
        count += 1
    return id2label, label2id

def get_images(folder):
    """
    returns numpy array of all samples in folder
    each column is a sample resized to 30x30 and flattened
    """
    files = get_files(folder)
    images = []
    count = 0
    
    for f in files:
        count += 1
        if count % 10000 == 0:
            print("Loaded {}/{}".format(count,len(files)))
        img_arr = get_img_array(f)
        img_arr = img_arr.flatten() / 255.0
        images.append(img_arr)
    X = np.column_stack(images)

    return X

def get_train_data(data_root_path):
    """
    Return X and y
    """
    train_data_path = data_root_path + 'train'
    id2label, label2id = get_label_mapping(data_root_path+'labels.txt')
    print(label2id)
    X = get_images(train_data_path)
    y = get_labels(train_data_path, label2id)
    return X, y

def save_predictions(filename, y):
    """
    Dumps y into .npy file
    """
    np.save(filename, y)

In [6]:
# Load the data
data_root_path = 'cifar10-hw1/'
X_train, y_train = get_train_data(data_root_path) # this may take a few minutes
X_test = get_images(data_root_path + 'test')
print('Data loading done')

{'airplane': 0, 'automobile': 1, 'bird': 2, 'cat': 3, 'deer': 4, 'dog': 5, 'frog': 6, 'horse': 7, 'ship': 8, 'truck': 9}
Loaded 10000/50000
Loaded 20000/50000
Loaded 30000/50000
Loaded 40000/50000
Loaded 50000/50000
Loaded 10000/10000
Data loading done


In [7]:
print(X_train.shape)
print(y_train.shape)
print(y_train[:5])

(3072, 50000)
(50000,)
[6 1 6 6 8]


In [None]:
# Group's helper function
def split(X, y, test_size):
    '''
    split the data into training and validation set
    '''
    indices = np.random.permutation(X.shape[1])
    test_num = int(test_size * X.shape[1])
    return X[:, indices[test_num:]], X[:, indices[:test_num]], y[indices[test_num:]], y[indices[:test_num]]

#### Experiment on manually splitted train/validation (trn/val)

In [10]:
X_trn, X_val, y_trn, y_val = split(X_train, y_train, test_size=0.1)

In [11]:
layer_dimensions = [X_train.shape[0], 1000, 200, 50, 10]
NN = NeuralNetwork(layer_dimensions)
NN.load_validation_set(X_val, y_val)
NN.train(X_trn, y_trn, iters=5000, alpha=1, batch_size=100, print_every=100)

iter=    0, cost=2.2772, trn_acc=0.1072, val_acc=0.1078
iter=  100, cost=2.0054, trn_acc=0.2676, val_acc=0.2930
iter=  200, cost=1.9131, trn_acc=0.3277, val_acc=0.3570
iter=  300, cost=1.8278, trn_acc=0.3623, val_acc=0.3826
iter=  400, cost=1.7765, trn_acc=0.3831, val_acc=0.4042
iter=  500, cost=1.6021, trn_acc=0.4001, val_acc=0.4094
iter=  600, cost=1.6736, trn_acc=0.4052, val_acc=0.4276
iter=  700, cost=1.6786, trn_acc=0.4252, val_acc=0.4376
iter=  800, cost=1.5087, trn_acc=0.4409, val_acc=0.4500
iter=  900, cost=1.3721, trn_acc=0.4405, val_acc=0.4526
iter= 1000, cost=1.2821, trn_acc=0.4468, val_acc=0.4560
iter= 1100, cost=1.5660, trn_acc=0.4603, val_acc=0.4604
iter= 1200, cost=1.3536, trn_acc=0.4684, val_acc=0.4716
iter= 1300, cost=1.4831, trn_acc=0.4729, val_acc=0.4722
iter= 1400, cost=1.4379, trn_acc=0.4877, val_acc=0.4792
iter= 1500, cost=1.2795, trn_acc=0.4767, val_acc=0.4762
iter= 1600, cost=1.4163, trn_acc=0.4880, val_acc=0.4754
iter= 1700, cost=1.2399, trn_acc=0.5050, val_acc

In [15]:
# check if iterations are not enough: the answer is that 5000 is enough
layer_dimensions = [X_train.shape[0], 1000, 200, 50, 10]
NN = NeuralNetwork(layer_dimensions)
NN.load_validation_set(X_val, y_val)
NN.train(X_trn, y_trn, iters=10000, alpha=1, batch_size=100, print_every=100)

iter=    0, cost=2.2772, trn_acc=0.1072, val_acc=0.1078
iter=  100, cost=2.0054, trn_acc=0.2676, val_acc=0.2930
iter=  200, cost=1.9131, trn_acc=0.3277, val_acc=0.3570
iter=  300, cost=1.8278, trn_acc=0.3623, val_acc=0.3826
iter=  400, cost=1.7765, trn_acc=0.3831, val_acc=0.4042
iter=  500, cost=1.6021, trn_acc=0.4001, val_acc=0.4094
iter=  600, cost=1.6736, trn_acc=0.4052, val_acc=0.4276
iter=  700, cost=1.6786, trn_acc=0.4252, val_acc=0.4376
iter=  800, cost=1.5087, trn_acc=0.4409, val_acc=0.4500
iter=  900, cost=1.3721, trn_acc=0.4405, val_acc=0.4526
iter= 1000, cost=1.2821, trn_acc=0.4468, val_acc=0.4560
iter= 1100, cost=1.5660, trn_acc=0.4603, val_acc=0.4604
iter= 1200, cost=1.3536, trn_acc=0.4684, val_acc=0.4716
iter= 1300, cost=1.4831, trn_acc=0.4729, val_acc=0.4722
iter= 1400, cost=1.4379, trn_acc=0.4877, val_acc=0.4792
iter= 1500, cost=1.2795, trn_acc=0.4767, val_acc=0.4762
iter= 1600, cost=1.4163, trn_acc=0.4880, val_acc=0.4754
iter= 1700, cost=1.2399, trn_acc=0.5050, val_acc

## Part 1

#### Simple fully-connected deep neural network
#### Final output for the given test set (train/test)

In [12]:
layer_dimensions = [X_train.shape[0], 1000, 200, 50, 10]  # including the input and output layers
NN = NeuralNetwork(layer_dimensions)
NN.train(X_train, y_train, iters=5000, alpha=1, batch_size=100, print_every=100)

iter=    0, cost=2.3859, trn_acc=0.1160, val_acc=nan
iter=  100, cost=2.0567, trn_acc=0.2779, val_acc=nan
iter=  200, cost=1.9116, trn_acc=0.3363, val_acc=nan
iter=  300, cost=1.7787, trn_acc=0.3665, val_acc=nan
iter=  400, cost=1.6508, trn_acc=0.3790, val_acc=nan
iter=  500, cost=1.7339, trn_acc=0.3990, val_acc=nan
iter=  600, cost=1.4921, trn_acc=0.4213, val_acc=nan
iter=  700, cost=1.6661, trn_acc=0.4277, val_acc=nan
iter=  800, cost=1.6047, trn_acc=0.4303, val_acc=nan
iter=  900, cost=1.5008, trn_acc=0.4422, val_acc=nan
iter= 1000, cost=1.4384, trn_acc=0.4428, val_acc=nan
iter= 1100, cost=1.4229, trn_acc=0.4596, val_acc=nan
iter= 1200, cost=1.4369, trn_acc=0.4653, val_acc=nan
iter= 1300, cost=1.4385, trn_acc=0.4718, val_acc=nan
iter= 1400, cost=1.5543, trn_acc=0.4750, val_acc=nan
iter= 1500, cost=1.1944, trn_acc=0.4910, val_acc=nan
iter= 1600, cost=1.3153, trn_acc=0.4953, val_acc=nan
iter= 1700, cost=1.4025, trn_acc=0.4898, val_acc=nan
iter= 1800, cost=1.4529, trn_acc=0.5029, val_a

In [13]:
y_predicted = NN.predict(X_test)
save_predictions('ans1-uni', y_predicted)

In [14]:
# test if your numpy file has been saved correctly
loaded_y = np.load('ans1-uni.npy')
print(loaded_y.shape)
loaded_y[:10]

(10000,)


array([3, 8, 0, 4, 5, 3, 8, 4, 8, 1], dtype=int64)

#### Experiments for dropout

In [16]:
X_trn, X_val, y_trn, y_val = split(X_train, y_train, test_size=0.1)

In [17]:
layer_dimensions = [X_train.shape[0], 1000, 200, 50, 10]
NN = NeuralNetwork(layer_dimensions, drop_prob=0.065, reg_lambda=0.0)
NN.load_validation_set(X_val, y_val)
# 4900 iter to 0.5194
NN.train(X_trn, y_trn, iters=10000, alpha=1, batch_size=100, print_every=100)

iter=    0, cost=2.3254, trn_acc=0.1193, val_acc=0.1128
iter=  100, cost=1.9503, trn_acc=0.2760, val_acc=0.2888
iter=  200, cost=1.8306, trn_acc=0.3285, val_acc=0.3382
iter=  300, cost=1.7726, trn_acc=0.3604, val_acc=0.3738
iter=  400, cost=1.8955, trn_acc=0.3877, val_acc=0.3980
iter=  500, cost=1.5365, trn_acc=0.3962, val_acc=0.4088
iter=  600, cost=1.5045, trn_acc=0.4075, val_acc=0.4202
iter=  700, cost=1.6285, trn_acc=0.4246, val_acc=0.4270
iter=  800, cost=1.6281, trn_acc=0.4379, val_acc=0.4342
iter=  900, cost=1.5433, trn_acc=0.4473, val_acc=0.4438
iter= 1000, cost=1.5082, trn_acc=0.4448, val_acc=0.4472
iter= 1100, cost=1.5041, trn_acc=0.4607, val_acc=0.4446
iter= 1200, cost=1.4156, trn_acc=0.4610, val_acc=0.4536
iter= 1300, cost=1.5625, trn_acc=0.4747, val_acc=0.4592
iter= 1400, cost=1.4482, trn_acc=0.4744, val_acc=0.4562
iter= 1500, cost=1.5898, trn_acc=0.4816, val_acc=0.4624
iter= 1600, cost=1.3320, trn_acc=0.4895, val_acc=0.4696
iter= 1700, cost=1.5738, trn_acc=0.4896, val_acc

In [18]:
layer_dimensions = [X_train.shape[0], 1000, 200, 50, 10]
NN = NeuralNetwork(layer_dimensions, drop_prob=0.5, reg_lambda=0.0)
NN.load_validation_set(X_val, y_val)
NN.train(X_trn, y_trn, iters=6000, alpha=1, batch_size=100, print_every=100)

iter=    0, cost=2.7273, trn_acc=0.1178, val_acc=0.1128
iter=  100, cost=2.2517, trn_acc=0.2331, val_acc=0.2386
iter=  200, cost=2.2968, trn_acc=0.2484, val_acc=0.2542
iter=  300, cost=2.1033, trn_acc=0.2768, val_acc=0.2814
iter=  400, cost=2.1803, trn_acc=0.2820, val_acc=0.2996
iter=  500, cost=2.0181, trn_acc=0.3000, val_acc=0.3136
iter=  600, cost=2.0213, trn_acc=0.3160, val_acc=0.3292
iter=  700, cost=2.0101, trn_acc=0.3249, val_acc=0.3348
iter=  800, cost=2.1024, trn_acc=0.3404, val_acc=0.3432
iter=  900, cost=2.0388, trn_acc=0.3476, val_acc=0.3508
iter= 1000, cost=1.9798, trn_acc=0.3542, val_acc=0.3602
iter= 1100, cost=1.9242, trn_acc=0.3546, val_acc=0.3582
iter= 1200, cost=1.9046, trn_acc=0.3685, val_acc=0.3710
iter= 1300, cost=1.9298, trn_acc=0.3683, val_acc=0.3746
iter= 1400, cost=1.9494, trn_acc=0.3704, val_acc=0.3736
iter= 1500, cost=2.0656, trn_acc=0.3769, val_acc=0.3800
iter= 1600, cost=1.8559, trn_acc=0.3782, val_acc=0.3822
iter= 1700, cost=1.8782, trn_acc=0.3869, val_acc

In [33]:
layer_dimensions = [X_train.shape[0], 1000, 200, 50, 10]
NN = NeuralNetwork(layer_dimensions, drop_prob=0.1, reg_lambda=0.0)
NN.load_validation_set(X_val, y_val)
NN.train(X_trn, y_trn, iters=10000, alpha=1, batch_size=100, print_every=100)

iter=    0, cost=2.3416, trn_acc=0.1182, val_acc=0.1112
iter=  100, cost=1.9510, trn_acc=0.2706, val_acc=0.2850
iter=  200, cost=1.9061, trn_acc=0.3251, val_acc=0.3370
iter=  300, cost=1.8005, trn_acc=0.3552, val_acc=0.3670
iter=  400, cost=1.9061, trn_acc=0.3828, val_acc=0.3928
iter=  500, cost=1.5840, trn_acc=0.3941, val_acc=0.4026
iter=  600, cost=1.5878, trn_acc=0.3991, val_acc=0.4138
iter=  700, cost=1.6535, trn_acc=0.4186, val_acc=0.4228
iter=  800, cost=1.6627, trn_acc=0.4275, val_acc=0.4340
iter=  900, cost=1.5304, trn_acc=0.4399, val_acc=0.4390
iter= 1000, cost=1.5391, trn_acc=0.4386, val_acc=0.4432
iter= 1100, cost=1.5205, trn_acc=0.4537, val_acc=0.4476
iter= 1200, cost=1.4340, trn_acc=0.4537, val_acc=0.4508
iter= 1300, cost=1.5837, trn_acc=0.4672, val_acc=0.4554
iter= 1400, cost=1.4943, trn_acc=0.4695, val_acc=0.4560
iter= 1500, cost=1.6310, trn_acc=0.4756, val_acc=0.4550
iter= 1600, cost=1.4170, trn_acc=0.4814, val_acc=0.4638
iter= 1700, cost=1.5872, trn_acc=0.4829, val_acc

#### Test for L2 regularization
0.05, 0.005, 0.001 all too large, 10\*\*(-4) seems acceptable

In [35]:
# Test the L2 Regularization: 0.05 is too large
layer_dimensions = [X_train.shape[0], 1000, 200, 50, 10]
NN = NeuralNetwork(layer_dimensions, drop_prob=0.0, reg_lambda=0.05)
NN.load_validation_set(X_val, y_val)
NN.train(X_trn, y_trn, iters=1000, alpha=1, batch_size=100, print_every=100)

iter=    0, cost=33.8131, trn_acc=0.1192, val_acc=0.1136
iter=  100, cost=2.3037, trn_acc=0.1536, val_acc=0.1702
iter=  200, cost=2.3026, trn_acc=0.1536, val_acc=0.1702
iter=  300, cost=2.3026, trn_acc=0.1536, val_acc=0.1702
iter=  400, cost=2.3026, trn_acc=0.1536, val_acc=0.1702
iter=  500, cost=2.3026, trn_acc=0.1536, val_acc=0.1702
iter=  600, cost=2.3026, trn_acc=0.1536, val_acc=0.1702
iter=  700, cost=2.3026, trn_acc=0.1536, val_acc=0.1702
iter=  800, cost=2.3026, trn_acc=0.1536, val_acc=0.1702
iter=  900, cost=2.3026, trn_acc=0.1536, val_acc=0.1702


In [36]:
layer_dimensions = [X_train.shape[0], 1000, 200, 50, 10]
NN = NeuralNetwork(layer_dimensions, drop_prob=0.0, reg_lambda=0.005)
NN.load_validation_set(X_val, y_val)
NN.train(X_trn, y_trn, iters=5000, alpha=1, batch_size=100, print_every=100)

iter=    0, cost=5.4393, trn_acc=0.1190, val_acc=0.1136
iter=  100, cost=3.3789, trn_acc=0.2213, val_acc=0.2378
iter=  200, cost=2.6995, trn_acc=0.2178, val_acc=0.2162
iter=  300, cost=2.4548, trn_acc=0.2141, val_acc=0.2016
iter=  400, cost=2.3591, trn_acc=0.2107, val_acc=0.1942
iter=  500, cost=2.3235, trn_acc=0.2093, val_acc=0.1908
iter=  600, cost=2.3103, trn_acc=0.2088, val_acc=0.1898
iter=  700, cost=2.3054, trn_acc=0.2087, val_acc=0.1894
iter=  800, cost=2.3036, trn_acc=0.2087, val_acc=0.1894
iter=  900, cost=2.3030, trn_acc=0.2086, val_acc=0.1896
iter= 1000, cost=2.3027, trn_acc=0.2086, val_acc=0.1896
iter= 1100, cost=2.3026, trn_acc=0.2086, val_acc=0.1896
iter= 1200, cost=2.3026, trn_acc=0.2086, val_acc=0.1896
iter= 1300, cost=2.3026, trn_acc=0.2086, val_acc=0.1896
iter= 1400, cost=2.3026, trn_acc=0.2086, val_acc=0.1896
iter= 1500, cost=2.3026, trn_acc=0.2086, val_acc=0.1896
iter= 1600, cost=2.3026, trn_acc=0.2086, val_acc=0.1896


KeyboardInterrupt: 

In [37]:
layer_dimensions = [X_train.shape[0], 1000, 200, 50, 10]
NN = NeuralNetwork(layer_dimensions, drop_prob=0.0, reg_lambda=0.001)
NN.load_validation_set(X_val, y_val)
NN.train(X_trn, y_trn, iters=5000, alpha=1, batch_size=100, print_every=100)

iter=    0, cost=2.9172, trn_acc=0.1189, val_acc=0.1134
iter=  100, cost=2.5606, trn_acc=0.2551, val_acc=0.2758
iter=  200, cost=2.4062, trn_acc=0.2928, val_acc=0.3066
iter=  300, cost=2.3148, trn_acc=0.3274, val_acc=0.3412
iter=  400, cost=2.2519, trn_acc=0.3356, val_acc=0.3496
iter=  500, cost=2.0762, trn_acc=0.3445, val_acc=0.3616
iter=  600, cost=1.9950, trn_acc=0.3412, val_acc=0.3622
iter=  700, cost=1.9583, trn_acc=0.3463, val_acc=0.3540
iter=  800, cost=1.9567, trn_acc=0.3316, val_acc=0.3538
iter=  900, cost=1.9653, trn_acc=0.3269, val_acc=0.3472
iter= 1000, cost=1.9317, trn_acc=0.3341, val_acc=0.3476
iter= 1100, cost=2.0324, trn_acc=0.3305, val_acc=0.3454
iter= 1200, cost=1.9130, trn_acc=0.3188, val_acc=0.3436
iter= 1300, cost=1.9648, trn_acc=0.3272, val_acc=0.3428
iter= 1400, cost=1.8840, trn_acc=0.3215, val_acc=0.3382
iter= 1500, cost=1.8023, trn_acc=0.3297, val_acc=0.3370
iter= 1600, cost=1.8596, trn_acc=0.3266, val_acc=0.3370
iter= 1700, cost=1.8643, trn_acc=0.3107, val_acc

KeyboardInterrupt: 

In [40]:
layer_dimensions = [X_train.shape[0], 1000, 200, 50, 10]
NN = NeuralNetwork(layer_dimensions, drop_prob=0.0, reg_lambda=10**(-4))
NN.load_validation_set(X_val, y_val)
NN.train(X_trn, y_trn, iters=10001, alpha=1, batch_size=100, print_every=100)

iter=    0, cost=2.3497, trn_acc=0.1189, val_acc=0.1134
iter=  100, cost=2.0610, trn_acc=0.2643, val_acc=0.2826
iter=  200, cost=1.9572, trn_acc=0.3190, val_acc=0.3388
iter=  300, cost=1.8653, trn_acc=0.3663, val_acc=0.3794
iter=  400, cost=1.8422, trn_acc=0.3743, val_acc=0.3964
iter=  500, cost=1.6808, trn_acc=0.3976, val_acc=0.4142
iter=  600, cost=1.6373, trn_acc=0.4057, val_acc=0.4222
iter=  700, cost=1.5681, trn_acc=0.4219, val_acc=0.4256
iter=  800, cost=1.5769, trn_acc=0.4282, val_acc=0.4390
iter=  900, cost=1.5736, trn_acc=0.4295, val_acc=0.4396
iter= 1000, cost=1.4976, trn_acc=0.4476, val_acc=0.4416
iter= 1100, cost=1.5790, trn_acc=0.4341, val_acc=0.4518
iter= 1200, cost=1.5367, trn_acc=0.4570, val_acc=0.4542
iter= 1300, cost=1.5746, trn_acc=0.4614, val_acc=0.4640
iter= 1400, cost=1.4248, trn_acc=0.4668, val_acc=0.4652
iter= 1500, cost=1.2521, trn_acc=0.4848, val_acc=0.4744
iter= 1600, cost=1.2945, trn_acc=0.4826, val_acc=0.4696
iter= 1700, cost=1.4142, trn_acc=0.4809, val_acc

#### Test for combination of dropout and regularization
I couldn't find a good combination of these two, though they are both effective on themselves.

It takes time to run this (about 30 seconds for 100 iterations on my laptop, so 10000 iterations take around 50 mins), it seems that more iterations could be useful, but the time is limited..

In [42]:
layer_dimensions = [X_train.shape[0], 1000, 200, 50, 10]
NN = NeuralNetwork(layer_dimensions, drop_prob=0.01, reg_lambda=10**(-4))
NN.load_validation_set(X_val, y_val)
NN.train(X_trn, y_trn, iters=10001, alpha=1, batch_size=100, print_every=100)

iter=    0, cost=2.3516, trn_acc=0.1185, val_acc=0.1126
iter=  100, cost=2.0053, trn_acc=0.2785, val_acc=0.2894
iter=  200, cost=1.9014, trn_acc=0.3328, val_acc=0.3410
iter=  300, cost=1.7160, trn_acc=0.3624, val_acc=0.3758
iter=  400, cost=1.8775, trn_acc=0.3804, val_acc=0.3968
iter=  500, cost=1.5603, trn_acc=0.3871, val_acc=0.4084
iter=  600, cost=1.5516, trn_acc=0.4028, val_acc=0.4212
iter=  700, cost=1.6742, trn_acc=0.4220, val_acc=0.4288
iter=  800, cost=1.6435, trn_acc=0.4313, val_acc=0.4328
iter=  900, cost=1.5985, trn_acc=0.4400, val_acc=0.4432
iter= 1000, cost=1.5041, trn_acc=0.4338, val_acc=0.4462
iter= 1100, cost=1.5347, trn_acc=0.4528, val_acc=0.4474
iter= 1200, cost=1.4142, trn_acc=0.4574, val_acc=0.4530
iter= 1300, cost=1.5990, trn_acc=0.4677, val_acc=0.4574
iter= 1400, cost=1.4235, trn_acc=0.4662, val_acc=0.4616
iter= 1500, cost=1.6213, trn_acc=0.4735, val_acc=0.4628
iter= 1600, cost=1.3699, trn_acc=0.4819, val_acc=0.4714
iter= 1700, cost=1.5902, trn_acc=0.4826, val_acc

In [None]:
layer_dimensions = [X_train.shape[0], 1000, 200, 50, 10]
NN = NeuralNetwork(layer_dimensions, drop_prob=0.01, reg_lambda=10**(-4))
NN.load_validation_set(X_val, y_val)
NN.train(X_trn, y_trn, iters=10001, alpha=0.5, batch_size=100, print_every=100)

iter=    0, cost=2.3516, trn_acc=0.1136, val_acc=0.1064
iter=  100, cost=2.1572, trn_acc=0.2329, val_acc=0.2498
iter=  200, cost=2.0785, trn_acc=0.2755, val_acc=0.2880
iter=  300, cost=1.8734, trn_acc=0.3105, val_acc=0.3190
iter=  400, cost=2.0387, trn_acc=0.3337, val_acc=0.3472
iter=  500, cost=1.7582, trn_acc=0.3442, val_acc=0.3654
iter=  600, cost=1.7384, trn_acc=0.3571, val_acc=0.3770
iter=  700, cost=1.7969, trn_acc=0.3752, val_acc=0.3926
iter=  800, cost=1.7700, trn_acc=0.3856, val_acc=0.3996
iter=  900, cost=1.7318, trn_acc=0.4017, val_acc=0.4130
iter= 1000, cost=1.6339, trn_acc=0.3991, val_acc=0.4130


## Part 2: Regularizing the neural network
#### Add dropout and L2 regularization
#### Final output for the given test set

In [44]:
layer_dimensions = [X_train.shape[0], 1000, 200, 50, 10]
NN2 = NeuralNetwork(layer_dimensions, drop_prob=0.1, reg_lambda=0.0)
NN2.train(X_train, y_train, iters=10001, alpha=1, batch_size=100, print_every=100)

iter=    0, cost=2.4009, trn_acc=0.1158, val_acc=nan
iter=  100, cost=2.0661, trn_acc=0.2640, val_acc=nan
iter=  200, cost=1.8798, trn_acc=0.3188, val_acc=nan
iter=  300, cost=1.8643, trn_acc=0.3533, val_acc=nan
iter=  400, cost=1.8440, trn_acc=0.3803, val_acc=nan
iter=  500, cost=1.9582, trn_acc=0.3981, val_acc=nan
iter=  600, cost=1.7163, trn_acc=0.4054, val_acc=nan
iter=  700, cost=1.5828, trn_acc=0.4195, val_acc=nan
iter=  800, cost=1.7700, trn_acc=0.4223, val_acc=nan
iter=  900, cost=1.6074, trn_acc=0.4390, val_acc=nan
iter= 1000, cost=1.6887, trn_acc=0.4416, val_acc=nan
iter= 1100, cost=1.5612, trn_acc=0.4467, val_acc=nan
iter= 1200, cost=1.6045, trn_acc=0.4521, val_acc=nan
iter= 1300, cost=1.4667, trn_acc=0.4591, val_acc=nan
iter= 1400, cost=1.4810, trn_acc=0.4623, val_acc=nan
iter= 1500, cost=1.5924, trn_acc=0.4712, val_acc=nan
iter= 1600, cost=1.5556, trn_acc=0.4766, val_acc=nan
iter= 1700, cost=1.6885, trn_acc=0.4776, val_acc=nan
iter= 1800, cost=1.5614, trn_acc=0.4863, val_a

In [47]:
y_predicted2 = NN2.predict(X_test)
save_predictions('ans2-uni', y_predicted2)

# expression testing

In [25]:
%%time
for _ in range(10**7):
    _2 = 1

Wall time: 496 ms


In [39]:
a = (1, '1')
a

(1, '1')

In [42]:
b = a + (3,)
b

(1, '1', 3)

In [48]:
b + (None, )

(1, '1', 3, None)

In [17]:
np.random.randn(3)

array([ 2.51287398,  0.43687488, -0.57080603])

In [None]:
M = np.random.rand(size = A.shape)
M = (M < prob) / (1 - prob)

In [7]:
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([[-1, 2, 3], [4,5,6]])
c = np.array([1,2,3]) 

In [None]:
c.ravel().tolist()

In [None]:
a.T.dot(a)

a is 1, 2


In [None]:
a = np.array([[1, 2, 3], [4, 5, 6]])

In [None]:
prob = np.exp(a - np.max(a, axis=0, keepdims=True)) 
prob

In [None]:
np.arange(300).shape

In [None]:
np.arange(3).shape

In [None]:
prob[[0, 1, 1], np.arange(3)]

In [None]:
np.sum(prob, axis=0, keepdims=True)

In [None]:
N = a.shape[1]

In [None]:
np.arange(N)

In [None]:
prob

In [None]:
prob[[1, 1, 0], np.arange(N)]

In [None]:
y_train.shape

In [None]:
x.shape

In [None]:
row_dim = x.shape[0]
row_dim

In [None]:
col_dim = np.prod(x.shape[1:])
col_dim

In [None]:
weight_size

In [None]:
X_train.shape


In [None]:
x.shape

In [None]:
row_dim = x.shape[0]
row_dim

In [None]:
x.shape[1:]

In [None]:
np.prod(x.shape[1:])

In [None]:
col_dim = np.prod(x.shape[1:])
col_dim

In [None]:
x_reshape = x.reshape(row_dim, col_dim)

In [None]:
x_reshape.shape

In [None]:
a

In [None]:
np.sum(a, axis = 1)