## HW5: Juan Arroyo. Collaborators: Hannes Koenig, Mark Vandergon

In [1]:
%load_ext autoreload

%autoreload 2

import numpy as np
from utils import *
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
# superclass of neural network "modules" (layers)
class Module:
    """
    Module is a super class. It could be a single layer, or a multilayer perceptron.
    """
    
    def __init__(self):
        self.train = True
        return
    
    def forward(self, _input):
        """
        z = f(a); a is the input, and h is the output.
        
        Inputs:
        _input: a
        
        Returns:
        output z
        """
        pass
    
    def backward(self, _input, _gradOutput):
        """
        Compute:
        gradient w.r.t. _input
        gradient w.r.t. trainable parameters
        
        Inputs (in lecture notation):
        _input: a 
        _gradOutput: dL/dz
        
        Returns:
        gradInput: dL/dz
        """
        pass
        
    def parameters(self):
        """
        Return the value of trainable parameters and its corresponding gradient (Used for grandient descent)
        
        Returns:
        params, gradParams
        """
        pass
    
    def training(self):
        """
        Turn the module into training mode.(Only useful for Dropout layer)
        Ignore it if you are not using Dropout.
        """
        self.train = True
        
    def evaluate(self):
        """
        Turn the module into evaluate mode.(Only useful for Dropout layer)
        Ignore it if you are not using Dropout.
        """
        self.train = False
        

In [3]:
# a class representing a sequence of modules (a layered network)
class Sequential(Module):
    """
    Sequential provides a way to plug layers together in a feed-forward manner.
    """
    def __init__(self):
        Module.__init__(self)
        self.layers = [] # layers contain all the layers in order
    
    def add(self, layer):
        self.layers.append(layer) # Add another layer at the end
    
    def size(self):
        return len(self.layers) # How many layers.
    
    def forward(self, _input):
        """
        Feed forward through all the layers, and return the output of the last layer
        """
        # self._inputs saves the input of each layer
        # self._inputs[i] is the input of i-th layer
        self._inputs = [_input]
        for i in range(self.size()):
            # The output of (i-1)-th layer as the _input of i-th layer
            self._inputs.append(self.layers[i].forward(self._inputs[i]))
        # The last element of self._inputs is the output of last layer
        self._output = self._inputs[-1]
        return self._output
    
    def backward(self, _input, _gradOutput):
        """
        Backpropogate through all the layers using chain rule.
        """
        # self._gradInputs[i] is the gradient of loss w.r.t. the input of i-th layer
        self._gradInputs = [None] * (self.size() + 1)
        self._gradInputs[self.size()] = _gradOutput
        for i in reversed(range(self.size())):
            self._gradInputs[i] = \
                self.layers[i].backward(self._inputs[i], self._gradInputs[i + 1])
        self._gradInput = self._gradInputs[0]
        return self._gradInput
    
    def parameters(self):
        """
        Return trainable parameters and its corresponding gradient in a nested list
        """
        params = []
        gradParams = []
        for m in self.layers:
            _p, _g = m.parameters()
            if _p is not None:
                params.append(_p)
                gradParams.append(_g)
        return params, gradParams

    def training(self):
        """
        Turn all the layers into training mode
        """
        Module.training(self)
        for m in self.layers:
            m.training()
    
    def evaluate(self):
        """
        Turn all the layers into evaluate mode
        """
        Module.evaluate(self)
        for m in self.layers:
            m.evaluate()
        

In [4]:
class FullyConnected(Module):
    """
    Fully connected layer (parameters include a matrix of weights a vector of biases)
    """
    def __init__(self, inputSize, outputSize):
        Module.__init__(self)
        # Initalization
        stdv = 1./np.sqrt(inputSize)
        
        self.weight = np.random.uniform(-stdv, stdv, (inputSize, outputSize))
        self.gradWeight = np.ndarray((inputSize, outputSize))
        print(self.gradWeight.shape, "Weight initial random values")
        self.bias = np.random.uniform(-stdv, stdv, outputSize)
        self.gradBias = np.ndarray(outputSize)
        
    def forward(self, _input):
        """
        output = W * input + b
        """
        self._input = _input
        self._output = np.dot(_input, self.weight) + self.bias
        return self._output
    
    def backward(self, _input, _gradOutput):
        """
        gradWeight = gradOutput * input
        gradBias = 
        gradInput =  gradWeight * gradOutput
        """
        self.gradWeight.fill(0)
        self.gradBias.fill(0)
        
        self.gradWeight += _input.T.dot(_gradOutput)
        self.gradBias += np.sum(_gradOutput,axis=0)
        self._gradInput = _gradOutput.dot(self.weight.T)
        return self._gradInput
        
    def parameters(self):
        """
        Return weight and bias and their g
        """
        return [self.weight, self.bias], [self.gradWeight, self.gradBias]


In [5]:
class ReLU(Module):
    """
    ReLU activation, not trainable.
    """
    def __init__(self):
        Module.__init__(self)
        return
    
    def forward(self, _input):
        """
        output = max(0, input)
        """
        self._input = _input
        self._output =  np.maximum(0, self._input)
        return self._output
    
    def backward(self, _input, _gradOutput):
        """
        gradInput = gradOutput * mask
        mask = _input > 0
        """
        self._gradInput = _gradOutput * (self._input > 0)
        return self._gradInput
        
    def parameters(self):
        """
        No trainable parametersm, return None
        """
        return None, None

In [6]:
class Sigmoid(Module):
    """
    sigmoid activation, not trainable.
    """
    def __init__(self):
        Module.__init__(self)
        return
    
    def forward(self, _input):
        """
        output = max(0, input)
        """
        self._input = _input
        self._output = 1. /  (1 + np.exp(-self._input))
        return self._output
    
    def backward(self, _input, _gradOutput):
        """
        gradInput = gradOutput * mask
        mask = _input > 0
        """
        self._gradInput = _gradOutput * (1. - self._output) * self._output
        return self._gradInput
        
    def parameters(self):
        """
        No trainable parametersm, return None
        """
        return None, None

In [7]:
# Optional
class Dropout(Module):
    """
    A dropout layer
    """
    def __init__(self, p = 0.5):
        Module.__init__(self)
        self.p = p #self.p is the drop rate, if self.p is 0, then it's a identity layer
        
    def forward(self, _input):
        self._output = _input
        if self.p > 0:
            if self.train:
                # Randomize a mask from bernoulli distrubition
                self.mask = np.random.binomial(1, 1 - self.p, _input.shape).astype('float64')
                # Scale the mask
                self.mask /= 1 - self.p
                self._output *= self.mask
        return self._output
    
    def backward(self, _input, _gradOutput):
        self._gradInput = _gradOutput
        if self.train:
            if self.p > 0:
                self._gradInput *= self.mask
        return self._gradInput
    
    def parameters(self):
        """
        No trainable parameters.
        """
        return None, None

In [8]:
class SoftMaxLoss(object):
    def __init__(self):
        return
        
    def forward(self, _input, _label):
        """
        Softmax and cross entropy loss layer. Should return a scalar, since it's a
        loss. (It's almost identical to what we had in Pset 2)

        Inputs:
        _input: N x C
        _labels: N x C, one-hot

        Returns: loss (scalar)
        """
        self._input = _input - _input.max(1)[:, np.newaxis]
        self._logprob = self._input - np.log(np.exp(self._input).sum(1)[:, np.newaxis])
        self._output = np.mean(np.sum(-self._logprob * _label, 1))
        return self._output
    
    def backward(self, _input, _label):
        self._gradInput = (np.exp(self._logprob) - _label) / _label.shape[0]
        return self._gradInput

In [9]:
# Test softmaxloss, the relative error should be small enough
def test_sm():
    crit = SoftMaxLoss()
    gt = np.zeros((3, 10))
    gt[np.arange(3), np.array([1,2,3])] = 1
    x = np.random.random((3,10))
    def test_f(x):
        return crit.forward(x, gt)

    print(crit.forward(x, gt))

    gradInput = crit.backward(x, gt)
    gradInput_num = numeric_gradient(test_f, x, 1, 1e-6)
    print("my grad", gradInput)
    print("numerical", gradInput_num)
    print(relative_error(gradInput, gradInput_num, 1e-8))
    error = relative_error(gradInput, gradInput_num, 1e-8)
    print("error", error)
    print(error <=1e-6)
    
test_sm()


2.31738759809
my grad [[ 0.02987379 -0.29002301  0.01666303  0.03417416  0.04503975  0.03570464
   0.03440032  0.02966941  0.02641884  0.03807907]
 [ 0.02959471  0.0404012  -0.3086131   0.02108355  0.03496388  0.04611156
   0.03115231  0.02477963  0.04192224  0.03860401]
 [ 0.03208438  0.03323847  0.0210828  -0.3002426   0.03050131  0.03979036
   0.0361958   0.04716939  0.04092929  0.0192508 ]]
numerical [[ 0.02987379 -0.29002301  0.01666303  0.03417416  0.04503975  0.03570464
   0.03440032  0.02966941  0.02641884  0.03807907]
 [ 0.02959471  0.0404012  -0.3086131   0.02108355  0.03496388  0.04611156
   0.03115231  0.02477963  0.04192224  0.03860401]
 [ 0.03208438  0.03323847  0.0210828  -0.3002426   0.03050131  0.03979036
   0.0361958   0.04716939  0.04092929  0.0192508 ]]
7.93413402274e-09
error 7.93413402274e-09
True


In [10]:
# Test modules, all the relative errors should be small enough (on the order of 1e-6 or smaller)
def test_module(model):

    model.evaluate()

    crit = TestCriterion()
    gt = np.random.random((3,10))
    x = np.random.random((3,10))
    def test_f(x):
        return crit.forward(model.forward(x), gt)

    gradInput = model.backward(x, crit.backward(model.forward(x), gt))
    gradInput_num = numeric_gradient(test_f, x, 1, 1e-6)
    print(relative_error(gradInput, gradInput_num, 1e-8))

# Test fully connected
model = FullyConnected(10, 10)
print('testing FullyConnected')
test_module(model)

# Test ReLU
model = ReLU()
print('testing ReLU')
test_module(model)

# Test Dropout
model = Dropout()
print("testing dropout")
test_module(model)

# Test Sequential
model = Sequential()
model.add(FullyConnected(10, 10))
model.add(ReLU())
model.add(FullyConnected(10, 10))
#model.add(Dropout())
print('testing 2-layer model')
test_module(model)

(10, 10) Weight initial random values
testing FullyConnected
3.81634373313e-09
testing ReLU
7.36022825478e-10
testing dropout
7.36022825478e-10
(10, 10) Weight initial random values
(10, 10) Weight initial random values
testing 2-layer model
2.97047503507e-08


In [11]:
# Test gradient descent, the loss should be lower and lower
trainX = np.random.random((10,5))

model = Sequential()
model.add(FullyConnected(5, 3))
model.add(ReLU())
model.add(Dropout())
model.add(FullyConnected(3, 1))

crit = TestCriterion()

params, gradParams = model.parameters()

it = 0
state = None
while True:
    output = model.forward(trainX)
    loss = crit.forward(output, None)
    if it % 100 == 0:
        print(loss)
    doutput = crit.backward(output, None)
    model.backward(trainX, doutput)
    sgdm(params, gradParams, 0.01, 0.8, state)
    if it > 1000:
        break
    it += 1
    

(5, 3) Weight initial random values
(3, 1) Weight initial random values
0.530324502841
0.00967549715902
0.00967549715902
0.00967549715902
0.00967549715902
0.00967549715902
0.00967549715902
0.00967549715902
0.00967549715902
0.00967549715902
0.00967549715902


Now we start to work on real data. The first one is Fashion MNIST.

In [12]:
import FMNIST_utils

# We only consider large set this time
print("Load large trainset.")
Xlarge,Ylarge = FMNIST_utils.load_data("Tr")
print(Xlarge.shape)
print(Ylarge.shape)

print("Load valset.")
Xval,Yval = FMNIST_utils.load_data("Vl")
print(Xval.shape)
print(Yval.shape)

Load large trainset.
(50000, 784)
(50000, 10)
Load valset.
(10000, 784)
(10000, 10)


In [13]:
def predict(X, model):
    """
    Evaluate the soft predictions of the model.
    Input:
    X : N x d array (no unit terms)
    model : a multi-layer perceptron
    Output:
    yhat : N x C array
        yhat[n][:] contains the score over C classes for X[n][:]
    """
    return model.forward(X)

def error_rate(X, Y, model):
    """
    Compute error rate (between 0 and 1) for the model
    """
    model.evaluate()
    res = 1 - (model.forward(X).argmax(-1) == Y.argmax(-1)).mean()
    model.training()
    return res

from copy import deepcopy

def runTrainVal(X,Y,model,Xval,Yval,trainopt):
    """
    Run the train + evaluation on a given train/val partition
    trainopt: various (hyper)parameters of the training procedure
    During training, choose the model with the lowest validation error. (early stopping)
    Assumes (global) variable crit containing the loss (training "criterion" to be minimized)
    """
    
    params, gradParams = model.parameters()
    
    eta = trainopt['eta']
    
    N = X.shape[0] # number of data points in X
    
    # Save the model with lowest validation error
    minValError = np.inf
    saved_model = None # Save the best model accoring to validation error
    
    shuffled_idx = np.random.permutation(N)
    start_idx = 0
    for iteration in range(trainopt['maxiter']):
        if iteration % int(trainopt['eta_frac'] * trainopt['maxiter']) == 0:
            eta *= trainopt['etadrop']
        # form the next mini-batch
        stop_idx = min(start_idx + trainopt['batch_size'], N)
        batch_idx = range(N)[int(start_idx):int(stop_idx)]
        
        s_idx = shuffled_idx[batch_idx]
        
        bX = X[s_idx,:]
        bY = Y[s_idx,:]

        score = model.forward(bX)
        loss = crit.forward(score, bY)
        # note: this computes loss on the *batch* only, not on the entire training set!
        
        dscore = crit.backward(score, bY)
        model.backward(bX, dscore)
        
        # Update the data using preferred update rule
        
        if trainopt['update'] == 'sgdm':
            sgdm(params, gradParams, eta, weight_decay = trainopt['lambda'])    
        elif trainopt['update'] == 'sgd':
            sgd(params, gradParams, eta, weight_decay = trainopt['lambda'])
        elif trainopt['update'] == 'nesterov':
            sgdmom(params, gradParams, eta, weight_decay = trainopt['lambda'])


        start_idx = stop_idx % N
        
        if (iteration % trainopt['display_iter']) == 0:
            #compute train and val error; multiply by 100 for readability (make it percentage points)
            trainError = 100 * error_rate(X, Y, model)
            valError = 100 * error_rate(Xval, Yval, model)
            print('{:8} batch loss: {:.3f} train error: {:.3f} val error: {:.3f}'.format(iteration, loss, trainError, valError))
            
            # early stopping: save the best model snapshot so far (i.e., model with lowest val error)
            if valError < minValError:
                saved_model = deepcopy(model)
                minValError = valError
        
    return saved_model, minValError, trainError

In [14]:
def build_model(input_size, hidden_size, output_size, activation_func = 'ReLU', dropout = 0):
    """
    Build a model:
    input_size: the dimension of input data
    hidden_size: the dimension of hidden vector, hidden_size == 0 means only one layer;
        hidden_size = [h1, h2, ...] specifies multiple layers of sizes h1, h2 etc.
    output_size: the output size of final layer (typically, number of classes).
    activation_func: ReLU, sigmoid (defined above), Tanh (you'd have to define), etc. 
    dropout: the dropout rate: if dropout == 0, this is equivalent to no dropout
    """
    model = Sequential()
    
    if type(hidden_size) is int:
        hidden_size = [hidden_size] # ensure it's a list
    
    prev_size=input_size
    
    # add hidden layer(s) as requested
    if hidden_size[0] == 0: # no hidden layer
        pass
    
    else:
        for l in range(len(hidden_size)):
            model.add(FullyConnected(prev_size, hidden_size[l]))
            prev_size=hidden_size[l]
            
            if activation_func == 'ReLU':
                model.add(ReLU())
            elif activation_func == 'sigmoid':
                model.add(Sigmoid())
                
            if dropout > 0:
                model.add(Dropout(dropout))
                
    # now add output layer  
    model.add(FullyConnected(prev_size, output_size))

        
    return model
    

Below is an example of how one can define, train and evaluate a model (in this case a three-layer model, with 200 units in each hidden layer, for Fashion MNIST)

## First Training: 200 units per layer, 5 hidden layers

In [15]:
trainopt = {
    'eta': 1e-3,   # initial learning rate
    'maxiter': 60000,   # max number of iterations (updates) of SGD
    'display_iter': 5000,  # display batch loss every display_iter updates
    'batch_size': 128,  
    'etadrop': .5, # when dropping eta, multiply it by this number (e.g., .5 means halve it)
    'eta_frac': .25,  # drop eta after every eta_frac*maxiter
    'update': 'sgdm' # SGD with momentum (using the default momentum value, see utils.py)
}

NFEATURES = Xlarge.shape[1]

# we will maintain a record of models trained for different values of lambda
# these will be indexed directly by lambda value itself
trained_models = dict()

# set the (initial?) set of hyperparameters to explore

lambda_=0.0001
hidden_size_=[200,200,200,200,200] # five hidden layers, 200 units each

trainopt['lambda'] = lambda_
model = build_model(NFEATURES, hidden_size_, 10, dropout = 0.8)
crit = SoftMaxLoss()
# -- model trained on large train set
trained_model,valErr,trainErr = runTrainVal(Xlarge, Ylarge, model, Xval, Yval, trainopt)
trained_models[lambda_] = {'model': trained_model, "val_err": valErr, "train_err": trainErr }
print('train set model [ h = ',end='')
for l in range(len(hidden_size_)):
    print('%d '%hidden_size_[l],end='')
print(' ], lambda= %.4f ] --> train error: %.2f, val error: %.2f' % (lambda_, trainErr, valErr))


(784, 200) Weight initial random values
(200, 200) Weight initial random values
(200, 200) Weight initial random values
(200, 200) Weight initial random values
(200, 200) Weight initial random values
(200, 10) Weight initial random values
       0 batch loss: 58.513 train error: 88.830 val error: 88.690
    5000 batch loss: 2.185 train error: 81.898 val error: 81.540
   10000 batch loss: 2.235 train error: 83.212 val error: 82.780
   15000 batch loss: 1.928 train error: 86.564 val error: 86.030
   20000 batch loss: 2.119 train error: 87.650 val error: 87.070
   25000 batch loss: 2.138 train error: 87.242 val error: 86.750
   30000 batch loss: 1.905 train error: 82.124 val error: 81.880
   35000 batch loss: 1.919 train error: 82.384 val error: 82.180
   40000 batch loss: 2.060 train error: 83.574 val error: 83.060
   45000 batch loss: 2.079 train error: 85.518 val error: 85.100
   50000 batch loss: 2.038 train error: 86.454 val error: 85.850
   55000 batch loss: 1.950 train error: 87.36

## Second Training: 1000 units per layer,  2 hidden layers

In [None]:
trainopt = {
    'eta': 1e-3,   # initial learning rate
    'maxiter': 60000,   # max number of iterations (updates) of SGD
    'display_iter': 5000,  # display batch loss every display_iter updates
    'batch_size': 128,  
    'etadrop': .5, # when dropping eta, multiply it by this number (e.g., .5 means halve it)
    'eta_frac': .25,  # drop eta after every eta_frac*maxiter
    'update': 'sgdm' # SGD with momentum (using the default momentum value, see utils.py)
}

NFEATURES = Xlarge.shape[1]

# we will maintain a record of models trained for different values of lambda
# these will be indexed directly by lambda value itself
trained_models = dict()

# set the (initial?) set of hyperparameters to explore

lambda_=0.0001
hidden_size_=[1000,1000] # two hidden layers, 1000 units each

trainopt['lambda'] = lambda_
model = build_model(NFEATURES, hidden_size_, 10, dropout = 0.8)
crit = SoftMaxLoss()
# -- model trained on large train set
trained_model,valErr,trainErr = runTrainVal(Xlarge, Ylarge, model, Xval, Yval, trainopt)
trained_models[lambda_] = {'model': trained_model, "val_err": valErr, "train_err": trainErr }
print('train set model [ h = ',end='')
for l in range(len(hidden_size_)):
    print('%d '%hidden_size_[l],end='')
print(' ], lambda= %.4f ] --> train error: %.2f, val error: %.2f' % (lambda_, trainErr, valErr))


(784, 1000) Weight initial random values
(1000, 1000) Weight initial random values
(1000, 10) Weight initial random values
       0 batch loss: 86.717 train error: 78.430 val error: 78.670
    5000 batch loss: 1.020 train error: 22.252 val error: 23.670
   10000 batch loss: 0.801 train error: 19.712 val error: 21.310


In [89]:
best_model = trained_models[0.0001]['model']

In [92]:
#Generate a Kaggle submission file using best_trained_model which you should set based on your experiments
kaggleX = FMNIST_utils.load_data('kaggle')
kaggleYhat = predict(kaggleX, best_model).argmax(-1)
save_submission('submission-mnist.csv', kaggleYhat)

Saved: submission-mnist.csv


## Comments on MNIST data set

For the MNIST data set, I trained two different models. One with 5 hidden layers and 200 units per layer and the other with only two hidden layers, but 1000 units per layer.

I chose the same dropout rate (0.8) for both models, which seems to be the recommended rate for images in the following paper : *Dropout: A Simple Way to Prevent Neural Networks from Overfitting* https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf

As the results show, a wider network seems to do a better job than a deeper network in these examples. The best model was the one with two hidden layers and 1000 units per layer.


## Consumer Review Data

Now let's work on consumer review data (from Pset 3). We will use essentially the same code for the model and training as for Fashion MNIST, but apply it to another data set, with another set of feature functions (mapping text to vectors, instead of mapping images to vectors).

In [93]:
import CR_utils
X, Y, keys = CR_utils.preprocess(use_bigram = False,mincount=3)
X_train = X['train']
Y_train = CR_utils.binarize_labels(np.expand_dims(Y['train'],1))
X_val = X['val']
Y_val = CR_utils.binarize_labels(np.expand_dims(Y['val'],1))


Feature size:  2104


Again an example of how to train a model

# Default Model : One layer, 400 units

In [101]:
# -- training options
trainopt = {
    'eta': 1e-1,   # initial learning rate
    'maxiter': 10000,   # max number of iterations (updates) of SGD
    'display_iter': 1000,  # display batch loss every display_iter updates
    'batch_size': 1,  
    'etadrop': .5, # when dropping eta, multiply it by this number (e.g., .5 means halve it)
    'eta_frac': .25,  # how ofter to drop eta
    'update': 'sgdm'
}

NFEATURES = len(keys)

lambda_=0.0001
hidden_size_=[400] # one hidden layer, 400 units

trainopt['lambda'] = lambda_
model = build_model(NFEATURES, hidden_size_, 2, dropout = 0.2)
crit = SoftMaxLoss()
# -- model trained on large train set
trained_model,valErr,trainErr = runTrainVal(X_train, Y_train, model, X_val, Y_val, trainopt)
trained_models[lambda_] = {'model': trained_model, "val_err": valErr, "train_err": trainErr }
print('train set model [ h = ',end='')
for l in range(len(hidden_size_)):
    print('%d '%hidden_size_[l],end='')
print(' ], lambda= %.4f ] --> train error: %.2f, val error: %.2f' % (lambda_, trainErr, valErr))





(2104, 400) Weight initial random values
(400, 2) Weight initial random values
       0 batch loss: 0.710 train error: 41.477 val error: 39.600
    1000 batch loss: 0.735 train error: 24.685 val error: 32.000
    2000 batch loss: 0.399 train error: 21.189 val error: 27.200
    3000 batch loss: 0.113 train error: 14.450 val error: 28.200
    4000 batch loss: 0.066 train error: 14.378 val error: 27.000
    5000 batch loss: 0.527 train error: 11.928 val error: 27.200
    6000 batch loss: 0.117 train error: 8.288 val error: 24.800
    7000 batch loss: 0.228 train error: 9.946 val error: 27.600
    8000 batch loss: 0.003 train error: 7.820 val error: 25.200
    9000 batch loss: 0.181 train error: 5.225 val error: 25.800
train set model [ h = 400  ], lambda= 0.0001 ] --> train error: 5.23, val error: 24.80


## Second Training: 3 hidden layers, 800 units per layer

In [95]:
# -- training options
trainopt = {
    'eta': 1e-1,   # initial learning rate
    'maxiter': 10000,   # max number of iterations (updates) of SGD
    'display_iter': 1000,  # display batch loss every display_iter updates
    'batch_size': 1,  
    'etadrop': .5, # when dropping eta, multiply it by this number (e.g., .5 means halve it)
    'eta_frac': .25,  # how ofter to drop eta
    'update': 'sgdm'
}

NFEATURES = len(keys)

lambda_=0.0001
hidden_size_=[800, 800, 800] # one hidden layer, 400 units

trainopt['lambda'] = lambda_
model = build_model(NFEATURES, hidden_size_, 2, dropout = 0.2)
crit = SoftMaxLoss()
# -- model trained on large train set
trained_model,valErr,trainErr = runTrainVal(X_train, Y_train, model, X_val, Y_val, trainopt)
trained_models[lambda_] = {'model': trained_model, "val_err": valErr, "train_err": trainErr }
print('train set model [ h = ',end='')
for l in range(len(hidden_size_)):
    print('%d '%hidden_size_[l],end='')
print(' ], lambda= %.4f ] --> train error: %.2f, val error: %.2f' % (lambda_, trainErr, valErr))





(2104, 800) Weight initial random values
(800, 800) Weight initial random values
(800, 800) Weight initial random values
(800, 2) Weight initial random values
       0 batch loss: 0.701 train error: 36.360 val error: 33.000
    1000 batch loss: 0.882 train error: 33.802 val error: 31.200
    2000 batch loss: 0.383 train error: 26.631 val error: 33.400
    3000 batch loss: 0.606 train error: 18.811 val error: 28.800
    4000 batch loss: 0.488 train error: 20.396 val error: 29.000
    5000 batch loss: 0.020 train error: 15.063 val error: 26.600
    6000 batch loss: 0.606 train error: 11.207 val error: 27.600
    7000 batch loss: 0.069 train error: 9.838 val error: 27.000
    8000 batch loss: 0.073 train error: 6.739 val error: 27.400
    9000 batch loss: 0.155 train error: 5.225 val error: 26.200
train set model [ h = 800 800 800  ], lambda= 0.0001 ] --> train error: 5.23, val error: 26.20


In [102]:
best_model_cr = trained_models[0.0001]['model']

In [103]:
y_hat = predict(X['test'], best_model_cr).argmax(-1) * 2 - 1
CR_utils.save_submission('submission-CR.csv', y_hat)

# Comments on Consumer Data

For the consumer data set, I decided to train two models: 
 - The model with the default parameters provided in the assignment. 
 - Using what I learned from training the MNIST data, I increased the units to 800 and added to layers.
While the model with the default paremeters seems to have a smaller error on the validation set, our intuition is that a more complex model (complexity in terms of more units and layers) might be better to learn from the features in the data set.

After sending both predictions to Kaggle, it seems that the more complex model had better predictions.