Best results are :
EPOCH_NUM = 50
BATCH_SIZE = 32
hid_state_size = 128
out_size = 6
T = 150

LR_arr = [0.0007, 0.001, 0.003] (especially 0)
hid_layer_size_arr = [64]#, 256]#16, 32, 64, 128, 256]
alpha_arr = [0.85]#, 0.50 ,0.85]

In [1]:
import h5py as h5
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Import datasets
f = h5.File("assign3_data3.h5", "r")

# Convert them to np array
trainX = np.array(f['trX'])
testX = np.array(f['tstX'])
trainY = np.array(f['trY'])
testY = np.array(f['tstY'])
f.close()

train_sample = np.random.choice(3000, 2700, replace=False)
train_sample = train_sample.reshape((train_sample.shape[0]))
validation_sample = np.array(list(set(range(3000)) - set(train_sample.reshape((2700)))))
X_train = trainX.reshape((trainX.shape[0], trainX.shape[2], trainX.shape[1]))[train_sample]
X_val = trainX.reshape((trainX.shape[0], trainX.shape[2], trainX.shape[1]))[validation_sample]
y_train = trainY[train_sample]
y_val = trainY[validation_sample]
X_test = testX.reshape((testX.shape[0], testX.shape[2], testX.shape[1]))
y_test = testY
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape
X_train = X_train/4
X_val = X_val/4
X_test = X_test/4

In [3]:
def cell_forward(h_t_1, x_t, W_h_h, W_x_h, b_h):
    h_t = tanh(h_t_1.dot(W_h_h) + x_t.dot(W_x_h) + b_h)
    return h_t

def forward(x, h_init, W, b, T):
    H = np.zeros((x.shape[0], W['h_h'].shape[1], T))
    
    cur_h = h_init
    for t in range(T):
        cur_h = cell_forward(cur_h, x[:, :, t], W['h_h'], W['x_h'], b['h'])
        H[:,:,t] = cur_h
    
    hidden = RELU(cur_h.dot(W['1']) + b['1'])
    #print(hidden)
    O = softmax(hidden.dot(W['2']) + b['2'])
    
    return H, hidden, O

def cell_backward(delta_h_t, h_t_1, x_t, W_h_h, W_x_h, b_h):
    W_h_h_grad = h_t_1.transpose().dot(delta_h_t)
    W_x_h_grad = x_t.transpose().dot(delta_h_t)
    b_h_grad = np.ones((x_t.shape[0], 1)).transpose().dot(delta_h_t)
    
    e_h_t_1 = delta_h_t.dot(W_h_h.transpose())
    delta_h_t_1 = np.multiply(e_h_t_1, tanh_backprop(h_t_1))
    
    return delta_h_t_1, W_h_h_grad, W_x_h_grad, b_h_grad

def backward(x, y, h_init, H, hidden, O, W, b, T):
    W_grad = {}
    b_grad = {}
    delta_y = O - y
    W_grad['2'] = hidden.transpose().dot(delta_y)
    b_grad['2'] = np.ones((hidden.shape[0], 1)).transpose().dot(delta_y)
    
    e = delta_y.dot(W['2'].transpose())
    delta_hidden = np.multiply(e, RELU_backward(hidden))
    W_grad['1'] = H[:,:,-1].transpose().dot(delta_hidden)
    b_grad['1'] = np.ones((H[:,:,-1].shape[0], 1)).transpose().dot(delta_hidden)
    
    e = delta_hidden.dot(W['1'].transpose())
    cur_delta_h = np.multiply(e, tanh_backprop(H[:,:,-1]))
    
    W_grad['h_h'] = 0
    W_grad['x_h'] = 0
    b_grad['h'] = 0
    
    for t in range(T-1, 0, -1):
        cur_delta_h, cur_W_h_h_grad, cur_W_x_h_grad, cur_b_h_grad = cell_backward(cur_delta_h, H[:,:,t-1], x[:,:,t], W['h_h'], W['x_h'], b['h'])
        W_grad['h_h'] += cur_W_h_h_grad
        W_grad['x_h'] += cur_W_x_h_grad
        b_grad['h'] += cur_b_h_grad
    
    _, cur_W_h_h_grad, cur_W_x_h_grad, cur_b_h_grad = cell_backward(cur_delta_h, np.ones((x.shape[0], 1)).dot(h_init), x[:,:,0], W['h_h'], W['x_h'], b['h'])
    W_grad['h_h'] += cur_W_h_h_grad
    W_grad['x_h'] += cur_W_x_h_grad
    b_grad['h'] += cur_b_h_grad
    h_grad = np.sum(cur_delta_h.dot(W['h_h'].transpose()),axis=0).reshape((1, h_init.shape[1]))
    
    n = x.shape[0]
    W_grad['1'] /= n
    W_grad['2'] /= n
    W_grad['h_h'] /= n
    W_grad['x_h'] /= n
    b_grad['h'] /= n
    b_grad['1'] /= n
    b_grad['2'] /= n
    h_grad /= n
    return W_grad, b_grad, h_grad

In [4]:
def softmax(Z):
    e = np.exp(Z)
    row_sum = np.sum(e, axis=1).reshape((e.shape[0], 1))
    return e / row_sum

def RELU(X):
    return np.multiply(X, X > 0)

def RELU_backward(X):
    return 1*(X > 0)

def tanh(X):
    return ( np.exp(2*X) - 1 ) / ( np.exp(2 * X) + 1)

def tanh_backprop(X):
    return 1 - X**2

def unison_shuffled_copies(a, b):
    assert a.shape[0] == b.shape[0]
    p = np.random.permutation(a.shape[0])
    return a[p], b[p]

def initialize_weights(in_size, hid_state_size, hid_layer_size, out_size):
    #https://stackoverflow.com/questions/44883861/initial-bias-values-for-a-neural-network
    W = {}
    b = {}
    t = (6/(in_size+hid_state_size))**(1/2)
    W['x_h'] = np.random.uniform(low=-1*t, high=t, size=(in_size, hid_state_size))
    t = (3/(hid_state_size))**(1/2)
    W['h_h'] = np.random.uniform(low=-1*t, high=t, size=(hid_state_size, hid_state_size))
    b['h'] = np.zeros((1, hid_state_size))
    #b['h'] = np.random.uniform(low=-1*t, high=t, size=(1, hid_state_size))
    t = (6/(hid_state_size + hid_layer_size))**(1/2)
    W['1'] = np.random.uniform(low=-1*t, high=t, size=(hid_state_size, hid_layer_size))
    b['1'] = np.zeros((1, hid_layer_size))
    #b['1'] = np.random.uniform(low=-1*t, high=t, size=(1, hid_layer_size))
    t = (6/(hid_layer_size + out_size))**(1/2)
    W['2'] = np.random.uniform(low=-1*t, high=t, size=(hid_layer_size, out_size))
    b['2'] = np.zeros((1, out_size))
    #b['2'] = np.random.uniform(low=-1*t, high=t, size=(1, out_size))
    return W, b

def predict(X_test, y_test, T, h_init, W, b): # Counts true if the true word is in the top ten predictions
    _, __, outputs = forward(X_test, h_init, W, b, T)
    correct = np.argmax(y_test, axis=1)
    predictions = np.argmax(outputs, axis=1)
    return np.sum(correct == predictions) / y_test.shape[0]

def calculate_errors(o, d):
    target_predictions = np.multiply(o, d) # Only target class probs
    target_predictions = np.sum(target_predictions, axis=1) 
    return -1 * np.sum(np.log(target_predictions)) # -yi.log(pi)

In [7]:
def train(EPOCH_NUM, BATCH_SIZE, LR, alpha, hid_state_size, hid_layer_size, out_size, T, X_train, X_val, y_train, y_val):
    print(f'Train for epoch:{EPOCH_NUM}, batch size:{BATCH_SIZE}, lr: {LR}')

    W, b = initialize_weights(3, hid_state_size, hid_layer_size, out_size) # initialize the weights

    # CSE records
    train_CSE = []
    val_CSE = []
    val_acc = []

    delta_prev = None # For momentum
    batch_num = int(np.ceil(X_train.shape[0] / BATCH_SIZE))
    h_init = np.zeros((1, hid_state_size))
    patience = 2
    W_1 = None
    b_1 = None
    h_1 = None
    W_2 = None
    b_2 = None
    h_2 = None
    for epoch in range(EPOCH_NUM):
        if epoch > 0 and epoch % 1 == 0: # Report metrics
            # TODO
            train_accuracy = predict(X_train, y_train, T, h_init, W, b) # Counts true if the true word is in the top ten predictions
            val_accuracy = predict(X_val, y_val, T, h_init, W, b)
            print('Epoch:', epoch)
            print('Train CSE:', train_CSE[-1])
            print('Validation CSE:', val_CSE[-1])
            print('Train accuracy:', train_accuracy)
            print('Validation accuracy:', val_accuracy)
        # Shuffle dataset
        shuffled_X, shuffled_y = unison_shuffled_copies(X_train, y_train)

        totalCSE = 0

        beginning = 0

        for i in range(batch_num):
            if i < batch_num - 1:
                X = shuffled_X[beginning: beginning + BATCH_SIZE] 
                y = shuffled_y[beginning: beginning + BATCH_SIZE]
            else:
                X = shuffled_X[beginning: ] 
                y = shuffled_y[beginning: ]
            beginning += BATCH_SIZE
            
            # TODO
            ####### Calculate activations and errors ############
            H, hidden, O = forward(X, h_init, W, b, T) # Forward pass
            totalCSE += calculate_errors(O, y) # Loss calculation
            #####################################################
            W_grad, b_grad, h_grad = backward(X, y, h_init, H, hidden, O, W, b, T) # Backpropagation algorithm
            ##### Delta calculations for momentum ######
            delta_W_h_h = -1 * LR * W_grad['h_h'] 
            delta_W_x_h = -1 * LR * W_grad['x_h']
            delta_W_1 = -1 * LR * W_grad['1']
            delta_W_2 = -1 * LR * W_grad['2']
            delta_b_h = -1 * LR * b_grad['h']
            delta_b_1 = -1 * LR * b_grad['1']
            delta_b_2 = -1 * LR * b_grad['2']
            delta_h = -1 * LR * h_grad
            if delta_prev != None:
                delta_W_h_h += alpha * delta_prev['W_h_h']
                delta_W_x_h += alpha * delta_prev['W_x_h']
                delta_W_1 += alpha * delta_prev['W_1']
                delta_W_2 += alpha * delta_prev['W_2']
                delta_b_h += alpha * delta_prev['b_h']
                delta_b_1 += alpha * delta_prev['b_1']
                delta_b_2 += alpha * delta_prev['b_2']
                delta_h += alpha * delta_prev['h_init']
                
            delta_prev = {'W_h_h': delta_W_h_h, 'W_x_h': delta_W_x_h, 'W_1': delta_W_1, 'W_2': delta_W_2, 'b_h': delta_b_h, 'b_1': delta_b_1, 'b_2': delta_b_2, 'h_init': delta_h}
            ###### Update weights ###############
            W_2 = W_1
            b_2 = b_1
            h_2 = h_1
            W_1 = W
            b_1 = b
            h_1 = h_init
            W['h_h'] += delta_W_h_h
            W['x_h'] += delta_W_x_h
            W['1'] += delta_W_1
            W['2'] += delta_W_2
            b['h'] += delta_b_h
            b['2'] += delta_b_2
            b['1'] += delta_b_1
            h_init += delta_h
            #####################################

        train_CSE.append(totalCSE / X_train.shape[0])
        H, hidden, O = forward(X_val, h_init, W, b, T) # Forward pass
        val_CSE.append(calculate_errors(O, y_val) / X_val.shape[0])
        val_accuracy = predict(X_val, y_val, T, h_init, W, b)
        val_acc.append(val_accuracy)
        if len(val_CSE) > 1: # Early stopping if validation loss starts to increase or it is stationary
            if val_CSE[-1] > val_CSE[-2]:
                if patience == 1:
                    print('Finished at epoch', epoch)
                    return train_CSE, val_CSE, val_acc, W_2, b_2, h_2
                else:
                    W_last = W
                    patience -= 1
            else:
                patience = 2

    return train_CSE, val_CSE, val_acc, W, b, h_init

In [57]:
EPOCH_NUM = 50
BATCH_SIZE = 32
hid_state_size = 128
out_size = 6
T = 150

LR_arr = [0.0007, 0.001, 0.003]
hid_layer_size_arr = [64]#, 256]#16, 32, 64, 128, 256]
alpha_arr = [0.85]#, 0.50 ,0.85]

results_dummy = []
for LR in LR_arr:
    for hid_layer_size in hid_layer_size_arr:
        for alpha in alpha_arr:
            results_dummy.append(train(EPOCH_NUM, BATCH_SIZE, LR, alpha, hid_state_size, hid_layer_size, out_size, T, X_train, X_val, y_train, y_val))

Train for epoch:50, batch size:32, lr: 0.0007
Epoch: 5
Train CSE: 1.3735826137100622
Validation CSE: 1.4588204951755253
Train accuracy: 0.40703703703703703
Validation accuracy: 0.38666666666666666
Epoch: 10
Train CSE: 1.305738768151935
Validation CSE: 1.3031544579042922
Train accuracy: 0.48962962962962964
Validation accuracy: 0.4666666666666667
Epoch: 15
Train CSE: 1.6457348437023147
Validation CSE: 1.6466453855485574
Train accuracy: 0.26407407407407407
Validation accuracy: 0.25
Epoch: 20
Train CSE: 1.504286892456163
Validation CSE: 1.4899023843310208
Train accuracy: 0.3781481481481481
Validation accuracy: 0.34
Epoch: 25
Train CSE: 1.3893517293721331
Validation CSE: 1.6062932434318287
Train accuracy: 0.29777777777777775
Validation accuracy: 0.2966666666666667
Epoch: 30
Train CSE: 1.7054323817467338
Validation CSE: 1.7086455805936236
Train accuracy: 0.29703703703703704
Validation accuracy: 0.3233333333333333
Epoch: 35
Train CSE: 1.3360942299087177
Validation CSE: 1.3498908206311546
Trai

KeyboardInterrupt: 

In [65]:
results_dummy[0][2][-1]

0.45666666666666667

In [9]:
EPOCH_NUM = 50
BATCH_SIZE = 32
hid_state_size = 128
out_size = 6
T = 150

LR_arr = [0.001, 0.01, 0.1]
hid_layer_size_arr = [64, 256]#16, 32, 64, 128, 256]
alpha_arr = [0.1, 0.50 ,0.85]

results_train = {}
results_val = {}
results_W = {}
results_b = {}
results_h = {}
for LR in LR_arr:
    for hid_layer_size in hid_layer_size_arr:
        for alpha in alpha_arr:
            a, b, c, d, e = train(EPOCH_NUM, BATCH_SIZE, LR, alpha, hid_state_size, hid_layer_size, out_size, T, X_train, X_val, y_train, y_val)
            results_train[f'{LR}-{hid_layer_size}-{alpha}'] = a
            results_val[f'{LR}-{hid_layer_size}-{alpha}'] = b
            results_W[f'{LR}-{hid_layer_size}-{alpha}'] = c
            results_b[f'{LR}-{hid_layer_size}-{alpha}'] = d
            results_h[f'{LR}-{hid_layer_size}-{alpha}'] = e

Train for epoch:50, batch size:32, lr: 0.001
Epoch: 5
Train CSE: 1.6081058379652637
Validation CSE: 1.609592001547455
Train accuracy: 0.38814814814814813
Validation accuracy: 0.38666666666666666
Finished at epoch 7
Epoch: 10
Train CSE: 1.442667396204285
Validation CSE: 1.4603910603135584
Train accuracy: 0.4162962962962963
Validation accuracy: 0.38666666666666666
Finished at epoch 10
Finished at epoch 13
Epoch: 15
Train CSE: 1.3585142371106995
Validation CSE: 1.3937872015929942
Train accuracy: 0.4737037037037037
Validation accuracy: 0.43333333333333335
Finished at epoch 15
Finished at epoch 17
Epoch: 20
Train CSE: 1.7114887286636276
Validation CSE: 1.7492631542240098
Train accuracy: 0.26222222222222225
Validation accuracy: 0.24333333333333335
Finished at epoch 21
Finished at epoch 23
Finished at epoch 24
Epoch: 25
Train CSE: 1.5543221794845201
Validation CSE: 1.7417461187047478
Train accuracy: 0.2388888888888889
Validation accuracy: 0.23
Finished at epoch 27
Finished at epoch 29
Epoch: 

Finished at epoch 31
Epoch: 35
Train CSE: 1.324875568651365
Validation CSE: 1.321631481684067
Train accuracy: 0.46111111111111114
Validation accuracy: 0.44666666666666666
Finished at epoch 35
Finished at epoch 38
Epoch: 40
Train CSE: 1.2573575283477882
Validation CSE: 1.3016804836282494
Train accuracy: 0.5022222222222222
Validation accuracy: 0.45
Finished at epoch 41
Epoch: 45
Train CSE: 1.2525570216487276
Validation CSE: 1.2642135368428478
Train accuracy: 0.5037037037037037
Validation accuracy: 0.4666666666666667
Finished at epoch 45
Finished at epoch 46
Finished at epoch 48
Finished at epoch 49
Train for epoch:50, batch size:32, lr: 0.001
Finished at epoch 2
Finished at epoch 3
Epoch: 5
Train CSE: 1.7563460478277304
Validation CSE: 1.740513747978765
Train accuracy: 0.3396296296296296
Validation accuracy: 0.30333333333333334
Finished at epoch 6
Epoch: 10
Train CSE: 1.452402328184443
Validation CSE: 1.4023563296331027
Train accuracy: 0.4140740740740741
Validation accuracy: 0.41
Finishe

Epoch: 10
Train CSE: 1.6885959820131804
Validation CSE: 1.6844372235088294
Train accuracy: 0.3085185185185185
Validation accuracy: 0.2966666666666667
Finished at epoch 10
Finished at epoch 13
Epoch: 15
Train CSE: 1.406876675931674
Validation CSE: 1.4462365017106162
Train accuracy: 0.4285185185185185
Validation accuracy: 0.44
Finished at epoch 15
Finished at epoch 17
Finished at epoch 19
Epoch: 20
Train CSE: 1.5172577735819421
Validation CSE: 1.444234295390642
Train accuracy: 0.3925925925925926
Validation accuracy: 0.37666666666666665
Finished at epoch 21
Epoch: 25
Train CSE: 1.3637719441903804
Validation CSE: 1.3973343083249503
Train accuracy: 0.45740740740740743
Validation accuracy: 0.46
Finished at epoch 25
Finished at epoch 27
Finished at epoch 29
Epoch: 30
Train CSE: 1.337621749957585
Validation CSE: 1.3620743092843022
Train accuracy: 0.48777777777777775
Validation accuracy: 0.4766666666666667
Finished at epoch 30
Finished at epoch 32
Finished at epoch 33
Epoch: 35
Train CSE: 1.401

Finished at epoch 20
Finished at epoch 21
Finished at epoch 22
Finished at epoch 23
Finished at epoch 24
Epoch: 25
Train CSE: 1.793937001151329
Validation CSE: 1.7928294242580842
Train accuracy: 0.1685185185185185
Validation accuracy: 0.15
Finished at epoch 25
Finished at epoch 26
Finished at epoch 27
Finished at epoch 29
Epoch: 30
Train CSE: 1.7935075278746782
Validation CSE: 1.7935790994717407
Train accuracy: 0.1688888888888889
Validation accuracy: 0.14666666666666667
Finished at epoch 30
Finished at epoch 31
Finished at epoch 33
Finished at epoch 34
Epoch: 35
Train CSE: 1.7942080307186434
Validation CSE: 1.7937437523797135
Train accuracy: 0.1688888888888889
Validation accuracy: 0.14666666666666667
Finished at epoch 35
Finished at epoch 36
Finished at epoch 37
Finished at epoch 38
Finished at epoch 39
Epoch: 40
Train CSE: 1.7935836171940298
Validation CSE: 1.7922889247627165
Train accuracy: 0.1648148148148148
Validation accuracy: 0.18333333333333332
Finished at epoch 40
Finished at e

  del sys.path[0]
  del sys.path[0]


Epoch: 5
Train CSE: nan
Validation CSE: nan
Train accuracy: 0.1648148148148148
Validation accuracy: 0.18333333333333332
Epoch: 10
Train CSE: nan
Validation CSE: nan
Train accuracy: 0.1648148148148148
Validation accuracy: 0.18333333333333332
Epoch: 15
Train CSE: nan
Validation CSE: nan
Train accuracy: 0.1648148148148148
Validation accuracy: 0.18333333333333332
Epoch: 20
Train CSE: nan
Validation CSE: nan
Train accuracy: 0.1648148148148148
Validation accuracy: 0.18333333333333332
Epoch: 25
Train CSE: nan
Validation CSE: nan
Train accuracy: 0.1648148148148148
Validation accuracy: 0.18333333333333332
Epoch: 30
Train CSE: nan
Validation CSE: nan
Train accuracy: 0.1648148148148148
Validation accuracy: 0.18333333333333332
Epoch: 35
Train CSE: nan
Validation CSE: nan
Train accuracy: 0.1648148148148148
Validation accuracy: 0.18333333333333332
Epoch: 40
Train CSE: nan
Validation CSE: nan
Train accuracy: 0.1648148148148148
Validation accuracy: 0.18333333333333332
Epoch: 45
Train CSE: nan
Validati

  
  after removing the cwd from sys.path.


Epoch: 5
Train CSE: nan
Validation CSE: nan
Train accuracy: 0.1648148148148148
Validation accuracy: 0.18333333333333332
Epoch: 10
Train CSE: nan
Validation CSE: nan
Train accuracy: 0.1648148148148148
Validation accuracy: 0.18333333333333332
Epoch: 15
Train CSE: nan
Validation CSE: nan
Train accuracy: 0.1648148148148148
Validation accuracy: 0.18333333333333332
Epoch: 20
Train CSE: nan
Validation CSE: nan
Train accuracy: 0.1648148148148148
Validation accuracy: 0.18333333333333332
Epoch: 25
Train CSE: nan
Validation CSE: nan
Train accuracy: 0.1648148148148148
Validation accuracy: 0.18333333333333332
Epoch: 30
Train CSE: nan
Validation CSE: nan
Train accuracy: 0.1648148148148148
Validation accuracy: 0.18333333333333332
Epoch: 35
Train CSE: nan
Validation CSE: nan
Train accuracy: 0.1648148148148148
Validation accuracy: 0.18333333333333332
Epoch: 40
Train CSE: nan
Validation CSE: nan
Train accuracy: 0.1648148148148148
Validation accuracy: 0.18333333333333332
Epoch: 45
Train CSE: nan
Validati

In [10]:
for key in results_val.keys():
    print(key)
    print(max(results_val[key]))
    print('\n\n')

0.001-64-0.1
1.9940852492437633



0.001-64-0.5
1.8780726868903275



0.001-64-0.85
1.7145440360053714



0.001-256-0.1
1.8100379323398301



0.001-256-0.5
2.32945054184524



0.001-256-0.85
1.7697300723674827



0.01-64-0.1
2.420246716089482



0.01-64-0.5
1.8724131829825126



0.01-64-0.85
2.0117914293152457



0.01-256-0.1
1.8370247455558137



0.01-256-0.5
1.8292399644855877



0.01-256-0.85
2.0471627635086587



0.1-64-0.1
1.7943691973535358



0.1-64-0.5
1.797660165692455



0.1-64-0.85
nan



0.1-256-0.1
1.7939883285330431



0.1-256-0.5
1.796542325766361



0.1-256-0.85
nan





In [8]:
EPOCH_NUM = 50
BATCH_SIZE = 32
hid_state_size = 128
out_size = 6
T = 150

LR_arr = [0.0007]#, 0.001, 0.003]
hid_layer_size_arr = [64]#, 256]#16, 32, 64, 128, 256]
alpha_arr = [0.85]#, 0.50 ,0.85]

results = []
for LR in LR_arr:
    for hid_layer_size in hid_layer_size_arr:
        for alpha in alpha_arr:
            results.append(train(EPOCH_NUM, BATCH_SIZE, LR, alpha, hid_state_size, hid_layer_size, out_size, T, X_train, X_val, y_train, y_val))

Train for epoch:50, batch size:32, lr: 0.0007
Epoch: 1
Train CSE: 1.7568993256148897
Validation CSE: 1.7143201959817649
Train accuracy: 0.25703703703703706
Validation accuracy: 0.30666666666666664
Epoch: 2
Train CSE: 1.6236317789367483
Validation CSE: 1.5275835439326093
Train accuracy: 0.39185185185185184
Validation accuracy: 0.4533333333333333
Epoch: 3
Train CSE: 1.5073413936284807
Validation CSE: 1.4126988157479545
Train accuracy: 0.3837037037037037
Validation accuracy: 0.44666666666666666
Epoch: 4
Train CSE: 1.8579448520189328
Validation CSE: 1.899897661289443
Train accuracy: 0.17444444444444446
Validation accuracy: 0.10333333333333333
Epoch: 5
Train CSE: 1.8050076560805852
Validation CSE: 1.8165070418176053
Train accuracy: 0.13444444444444445
Validation accuracy: 0.10666666666666667
Epoch: 6
Train CSE: 1.7949555298721844
Validation CSE: 1.7730066417878758
Train accuracy: 0.18222222222222223
Validation accuracy: 0.14666666666666667
Epoch: 7
Train CSE: 1.7790319835357546
Validation C

In [19]:
len(results)

1

In [22]:
loaded = joblib.load('rnn_best.joblib')

In [24]:
train_cm,acc = confusion_matrix(X_test, y_test, 150, loaded[-1], loaded[-3], loaded[-2])
acc

Confusion matrix:

[[10.  8.  1. 12.  8.  7.]
 [ 7. 49.  0. 10.  9. 14.]
 [ 0.  0. 99.  0.  0.  0.]
 [66. 20.  0. 45. 44. 53.]
 [ 5. 11.  0. 22. 20. 12.]
 [12. 12.  0. 11. 19. 14.]]


0.395

In [21]:
import joblib
joblib.dump(results[0], 'rnn_best.joblib')

['rnn_best.joblib']

In [39]:
print(np.argmax(np.array(results[0][2])))
print(np.max(results[1][2]))
print(np.argmax(np.array(results[2][2])))


26
0.45666666666666667
37


In [19]:
EPOCH_NUM = 50
BATCH_SIZE = 32
hid_state_size = 128
out_size = 6
T = 150

LR_arr = [0.01]
hid_layer_size_arr = [32]#, 256]#16, 32, 64, 128, 256]
alpha_arr = [0.85]#, 0.50 ,0.85]

results_high_lr = []
for LR in LR_arr:
    for hid_layer_size in hid_layer_size_arr:
        for alpha in alpha_arr:
            results_high_lr.append(train(EPOCH_NUM, BATCH_SIZE, LR, alpha, hid_state_size, hid_layer_size, out_size, T, X_train, X_val, y_train, y_val))

Train for epoch:50, batch size:32, lr: 0.01
Finished at epoch 4
Epoch: 5
Train CSE: 1.847436245497743
Validation CSE: 1.8228520786700937
Train accuracy: 0.27925925925925926
Validation accuracy: 0.2833333333333333
Finished at epoch 7
Epoch: 10
Train CSE: 1.631730385608509
Validation CSE: 1.6128971368632192
Train accuracy: 0.29555555555555557
Validation accuracy: 0.28
Finished at epoch 12
Epoch: 15
Train CSE: 1.5179441683113315
Validation CSE: 1.521133055278508
Train accuracy: 0.2696296296296296
Validation accuracy: 0.2733333333333333
Finished at epoch 16
Finished at epoch 19
Epoch: 20
Train CSE: 1.5223285433266627
Validation CSE: 1.6265814621382262
Train accuracy: 0.29777777777777775
Validation accuracy: 0.28
Finished at epoch 21
Finished at epoch 22
Epoch: 25
Train CSE: 1.5059439962226298
Validation CSE: 1.4669163781343497
Train accuracy: 0.3425925925925926
Validation accuracy: 0.33666666666666667
Finished at epoch 25
Finished at epoch 26
Epoch: 30
Train CSE: 1.4792021502612887
Validat

In [20]:
max(results_high_lr[0][2])

0.41

In [80]:
best = 100
best_params = None
for key in results_val.keys():
    cur_max = min(results_val[key])
    if cur_max < best:
        best = cur_max
        best_params = key
best_params

'0.001-256-0'

In [27]:
best_W = results[2][-3]
best_b = results[2][-2]
best_h_init = results[2][-1]

train_cm, _ = confusion_matrix(X_train, y_train, 150, best_h_init, best_W, best_b)
test_cm, test_acc = confusion_matrix(X_test, y_test, 150, best_h_init, best_W, best_b)

Confusion matrix:

[[ 48.  11.   2.   5.  31.  17.]
 [ 51. 266.  19.  31.  99.  44.]
 [  7.   4. 394.   0.   9.   0.]
 [180.  66.   4. 379. 143. 164.]
 [ 69.  33.  19.  25.  61.  49.]
 [ 90.  71.   7.  15. 105. 182.]]
Confusion matrix:

[[  5.   3.   0.  11.   7.   2.]
 [  3.  34.   0.   1.   4.   2.]
 [  0.   1. 100.   1.   1.   0.]
 [ 77.  23.   0.  45.  54.  72.]
 [  8.  14.   0.   2.  13.   5.]
 [  7.  25.   0.  40.  21.  19.]]


In [38]:
np.sum(train_cm[1])

510.0

In [16]:
validation_loss = results[0][1]
validation_loss

[1.7143201959817649,
 1.5275835439326093,
 1.4126988157479545,
 1.899897661289443,
 1.8165070418176053,
 1.7730066417878758,
 1.7323610848604745,
 1.8276624292335142,
 1.7850515143132055,
 1.7511851158766407,
 1.7159569304400362,
 1.7150510495771956,
 1.7612529565722177,
 1.744342135664706,
 1.7096544157035907,
 1.704313532048405,
 1.7619309171890511,
 1.6660294556453168,
 1.7328838455281963,
 1.5402366311793423,
 1.4445078011052053,
 1.3851711976529293,
 1.3270561583189024,
 1.3202952162461645,
 1.4038876118286283,
 1.312813729257789,
 1.2860405045955106,
 1.4791200244715597,
 1.2389197259764957,
 1.2166063325775225,
 1.263874304373205,
 1.2168335401514676,
 1.4562509408595585,
 1.2247362833058653,
 1.4534318704448572,
 1.2569335380929871,
 1.2315057025393519,
 1.1759315883212538,
 1.1915781516840525,
 1.3066174281133636]

In [17]:
def plot_graph(funcs, labels, title, xlabel, ylabel):
    colors = ['black', 'blue', 'red', 'green']
    min_epoch = 50
    for func in funcs:
        if len(func) < min_epoch:
            min_epoch = len(func)
    for i, func in enumerate(funcs):
        plt.plot(range(min_epoch), func[:min_epoch], color=colors[i], label=labels[i])
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.show()

In [18]:
plot_graph([validation_loss], ['Validation error'], 'Validation CSE over Epochs', 'Epoch', 'CSE Error')

In [10]:
def confusion_matrix(X, y, T, h_init, W, b): # Counts true if the true word is in the top ten predictions
    _, __, outputs = forward(X, h_init, W, b, T)
    correct = np.argmax(y, axis=1)
    predictions = np.argmax(outputs, axis=1)
    conf_matrix = np.zeros((6,6))
    for i, prediction in enumerate(predictions):
        conf_matrix[prediction, correct[i]] += 1
    print('Confusion matrix:\n')
    print(conf_matrix)
    return conf_matrix, np.sum(correct == predictions) / y.shape[0]

In [11]:
best_W = results[0][3]
best_b = results[0][4]
best_h = results[0][5]

In [23]:
cm, acc = confusion_matrix(X_test, y_test, 150, best_h, best_W, best_b)
acc

Confusion matrix:

[[10.  8.  1. 12.  8.  7.]
 [ 7. 49.  0. 10.  9. 14.]
 [ 0.  0. 99.  0.  0.  0.]
 [66. 20.  0. 45. 44. 53.]
 [ 5. 11.  0. 22. 20. 12.]
 [12. 12.  0. 11. 19. 14.]]


0.395

In [15]:
print(cm)

[[10.  8.  1. 12.  8.  7.]
 [ 7. 49.  0. 10.  9. 14.]
 [ 0.  0. 99.  0.  0.  0.]
 [66. 20.  0. 45. 44. 53.]
 [ 5. 11.  0. 22. 20. 12.]
 [12. 12.  0. 11. 19. 14.]]


In [82]:
EPOCH_NUM = 50
BATCH_SIZE = 32
LR = 0.006
hid_state_size = 128
hid_layer_size = 64
out_size = 6
alpha = 0.85
T = 150
check2(EPOCH_NUM, BATCH_SIZE, LR, alpha, hid_state_size, hid_layer_size, out_size, T, X_train, X_val, y_train, y_val)

1.3880597014925373
0.9999999977007771 5.507722389275216e-08 6.331740687315346e-17


In [41]:
def check(EPOCH_NUM, BATCH_SIZE, LR, alpha, hid_state_size, hid_layer_size, out_size, T, X_train, X_val, y_train, y_val):
    W, b = initialize_weights(3, hid_state_size, hid_layer_size, out_size) # initialize the weights
    h_init = np.random.uniform(low=-1, high=1, size=(1, hid_state_size))
   
    X = X_train[:32]
    y = y_train[:32]
    epsilon = 0.0000001
    
    W_epsilon = {}
    b_epsilon = {}
    b_plus = {}
    W_epsilon['h_h'] = W['h_h'] - epsilon
    W_epsilon['x_h'] = W['x_h']
    W_epsilon['1'] = W['1']
    W_epsilon['2'] = W['2']
    b_epsilon['h'] = b['h']
    b_epsilon['1'] = b['1']
    b_epsilon['2'] = b['2']
    h_epsilon = h_init
    W_plus = {}
    W_plus['h_h'] = W['h_h'] + epsilon
    W_plus['x_h'] = W['x_h']
    W_plus['1'] = W['1']
    W_plus['2'] = W['2']
    b_plus['h'] = b['h']
    b_plus['1'] = b['1']
    b_plus['2'] = b['2']
    h_plus = h_init
    H, hidden, O = forward(X, h_epsilon, W_epsilon, b_epsilon, T) # Forward pass
    J_1 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_plus, W_plus, b_plus, T) # Forward pass
    J_2 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_init, W, b, T) # Forward pass
    W_grad, b_grad, h_grad = backward(X, y, h_init, H, hidden, O, W, b, T) # Backpropagation algorithm
    expected = np.sum(W_grad['h_h'])
    real = (J_2 - J_1)/(2*epsilon)
    print((real-expected)/(real+expected))
    
    W_epsilon = {}
    b_epsilon = {}
    b_plus = {}
    W_epsilon['h_h'] = W['h_h'] 
    W_epsilon['x_h'] = W['x_h']- epsilon
    W_epsilon['1'] = W['1']
    W_epsilon['2'] = W['2']
    b_epsilon['h'] = b['h']
    b_epsilon['1'] = b['1']
    b_epsilon['2'] = b['2']
    h_epsilon = h_init
    W_plus = {}
    W_plus['h_h'] = W['h_h'] 
    W_plus['x_h'] = W['x_h']+ epsilon
    W_plus['1'] = W['1']
    W_plus['2'] = W['2']
    b_plus['h'] = b['h']
    b_plus['1'] = b['1']
    b_plus['2'] = b['2']
    h_plus = h_init
    H, hidden, O = forward(X, h_epsilon, W_epsilon, b_epsilon, T) # Forward pass
    J_1 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_plus, W_plus, b_plus, T) # Forward pass
    J_2 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_init, W, b, T) # Forward pass
    W_grad, b_grad, h_grad = backward(X, y, h_init, H, hidden, O, W, b, T) # Backpropagation algorithm
    expected = np.sum(W_grad['x_h'])
    real = (J_2 - J_1)/(2*epsilon)
    print((real-expected)/(real+expected))
    
    W_epsilon = {}
    b_epsilon = {}
    b_plus = {}
    W_epsilon['h_h'] = W['h_h'] 
    W_epsilon['x_h'] = W['x_h']
    W_epsilon['1'] = W['1']- epsilon
    W_epsilon['2'] = W['2']
    b_epsilon['h'] = b['h']
    b_epsilon['1'] = b['1']
    b_epsilon['2'] = b['2']
    h_epsilon = h_init
    W_plus = {}
    W_plus['h_h'] = W['h_h'] 
    W_plus['x_h'] = W['x_h']
    W_plus['1'] = W['1']+ epsilon
    W_plus['2'] = W['2']
    b_plus['h'] = b['h']
    b_plus['1'] = b['1']
    b_plus['2'] = b['2']
    h_plus = h_init
    H, hidden, O = forward(X, h_epsilon, W_epsilon, b_epsilon, T) # Forward pass
    J_1 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_plus, W_plus, b_plus, T) # Forward pass
    J_2 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_init, W, b, T) # Forward pass
    W_grad, b_grad, h_grad = backward(X, y, h_init, H, hidden, O, W, b, T) # Backpropagation algorithm
    expected = np.sum(W_grad['1'])
    real = (J_2 - J_1)/(2*epsilon)
    print((real-expected)/(real+expected))
    
    W_epsilon = {}
    b_epsilon = {}
    b_plus = {}
    W_epsilon['h_h'] = W['h_h'] 
    W_epsilon['x_h'] = W['x_h']
    W_epsilon['1'] = W['1']
    W_epsilon['2'] = W['2']- epsilon
    b_epsilon['h'] = b['h']
    b_epsilon['1'] = b['1']
    b_epsilon['2'] = b['2']
    h_epsilon = h_init
    W_plus = {}
    W_plus['h_h'] = W['h_h'] 
    W_plus['x_h'] = W['x_h']
    W_plus['1'] = W['1']
    W_plus['2'] = W['2']+ epsilon
    b_plus['h'] = b['h']
    b_plus['1'] = b['1']
    b_plus['2'] = b['2']
    h_plus = h_init
    H, hidden, O = forward(X, h_epsilon, W_epsilon, b_epsilon, T) # Forward pass
    J_1 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_plus, W_plus, b_plus, T) # Forward pass
    J_2 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_init, W, b, T) # Forward pass
    W_grad, b_grad, h_grad = backward(X, y, h_init, H, hidden, O, W, b, T) # Backpropagation algorithm
    expected = np.sum(W_grad['2'])
    real = (J_2 - J_1)/(2*epsilon)
    print((real-expected)/(real+expected))
    
    W_epsilon = {}
    b_epsilon = {}
    b_plus = {}
    W_epsilon['h_h'] = W['h_h'] 
    W_epsilon['x_h'] = W['x_h']
    W_epsilon['1'] = W['1']
    W_epsilon['2'] = W['2']
    b_epsilon['h'] = b['h']- epsilon
    b_epsilon['1'] = b['1']
    b_epsilon['2'] = b['2']
    h_epsilon = h_init
    W_plus = {}
    W_plus['h_h'] = W['h_h'] 
    W_plus['x_h'] = W['x_h']
    W_plus['1'] = W['1']
    W_plus['2'] = W['2']
    b_plus['h'] = b['h']+ epsilon
    b_plus['1'] = b['1']
    b_plus['2'] = b['2']
    h_plus = h_init
    H, hidden, O = forward(X, h_epsilon, W_epsilon, b_epsilon, T) # Forward pass
    J_1 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_plus, W_plus, b_plus, T) # Forward pass
    J_2 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_init, W, b, T) # Forward pass
    W_grad, b_grad, h_grad = backward(X, y, h_init, H, hidden, O, W, b, T) # Backpropagation algorithm
    expected = np.sum(b_grad['h'])
    real = (J_2 - J_1)/(2*epsilon)
    print((real-expected)/(real+expected))
    
    W_epsilon = {}
    b_epsilon = {}
    b_plus = {}
    W_epsilon['h_h'] = W['h_h'] 
    W_epsilon['x_h'] = W['x_h']
    W_epsilon['1'] = W['1']
    W_epsilon['2'] = W['2']
    b_epsilon['h'] = b['h']
    b_epsilon['1'] = b['1']- epsilon
    b_epsilon['2'] = b['2']
    h_epsilon = h_init
    W_plus = {}
    W_plus['h_h'] = W['h_h'] 
    W_plus['x_h'] = W['x_h']
    W_plus['1'] = W['1']
    W_plus['2'] = W['2']
    b_plus['h'] = b['h']
    b_plus['1'] = b['1']+ epsilon
    b_plus['2'] = b['2']
    h_plus = h_init
    H, hidden, O = forward(X, h_epsilon, W_epsilon, b_epsilon, T) # Forward pass
    J_1 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_plus, W_plus, b_plus, T) # Forward pass
    J_2 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_init, W, b, T) # Forward pass
    W_grad, b_grad, h_grad = backward(X, y, h_init, H, hidden, O, W, b, T) # Backpropagation algorithm
    expected = np.sum(b_grad['1'])
    real = (J_2 - J_1)/(2*epsilon)
    print((real-expected)/(real+expected))
    
    W_epsilon = {}
    b_epsilon = {}
    b_plus = {}
    W_epsilon['h_h'] = W['h_h'] 
    W_epsilon['x_h'] = W['x_h']
    W_epsilon['1'] = W['1']
    W_epsilon['2'] = W['2']
    b_epsilon['h'] = b['h']
    b_epsilon['1'] = b['1']
    b_epsilon['2'] = b['2']- epsilon
    h_epsilon = h_init
    W_plus = {}
    W_plus['h_h'] = W['h_h'] 
    W_plus['x_h'] = W['x_h']
    W_plus['1'] = W['1']
    W_plus['2'] = W['2']
    b_plus['h'] = b['h']
    b_plus['1'] = b['1']
    b_plus['2'] = b['2']+ epsilon
    h_plus = h_init
    H, hidden, O = forward(X, h_epsilon, W_epsilon, b_epsilon, T) # Forward pass
    J_1 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_plus, W_plus, b_plus, T) # Forward pass
    J_2 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_init, W, b, T) # Forward pass
    W_grad, b_grad, h_grad = backward(X, y, h_init, H, hidden, O, W, b, T) # Backpropagation algorithm
    expected = np.sum(b_grad['2'])
    real = (J_2 - J_1)/(2*epsilon)
    print((real-expected)/(real+expected))
    
    W_epsilon = {}
    b_epsilon = {}
    b_plus = {}
    W_epsilon['h_h'] = W['h_h'] 
    W_epsilon['x_h'] = W['x_h']
    W_epsilon['1'] = W['1']
    W_epsilon['2'] = W['2']
    b_epsilon['h'] = b['h']
    b_epsilon['1'] = b['1']
    b_epsilon['2'] = b['2']
    h_epsilon = h_init - epsilon
    W_plus = {}
    W_plus['h_h'] = W['h_h'] 
    W_plus['x_h'] = W['x_h']
    W_plus['1'] = W['1']
    W_plus['2'] = W['2']
    b_plus['h'] = b['h']
    b_plus['1'] = b['1']
    b_plus['2'] = b['2']
    h_plus = h_init + epsilon
    H, hidden, O = forward(X, h_epsilon, W_epsilon, b_epsilon, T) # Forward pass
    J_1 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_plus, W_plus, b_plus, T) # Forward pass
    J_2 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_init, W, b, T) # Forward pass
    W_grad, b_grad, h_grad = backward(X, y, h_init, H, hidden, O, W, b, T) # Backpropagation algorithm
    expected = np.sum(h_grad)
    real = (J_2 - J_1)/(2*epsilon)
    print((real-expected)/(real+expected))

In [95]:
def check2(EPOCH_NUM, BATCH_SIZE, LR, alpha, hid_state_size, hid_layer_size, out_size, T, X_train, X_val, y_train, y_val):
    W, b = initialize_weights(3, hid_state_size, hid_layer_size, out_size) # initialize the weights
    h_init = np.random.uniform(low=-1, high=1, size=(1, hid_state_size))
   
    X = X_train[:32]
    y = y_train[:32]
    epsilon = 0.0000001
    
    W_epsilon = {}
    b_epsilon = {}
    b_plus = {}
    W_epsilon['h_h'] = W['h_h'] 
    W_epsilon['x_h'] = W['x_h']
    W_epsilon['1'] = W['1']
    W_epsilon['2'] = W['2']- epsilon
    b_epsilon['h'] = b['h']
    b_epsilon['1'] = b['1']
    b_epsilon['2'] = b['2']
    h_epsilon = h_init
    W_plus = {}
    W_plus['h_h'] = W['h_h'] 
    W_plus['x_h'] = W['x_h']
    W_plus['1'] = W['1']
    W_plus['2'] = W['2']+ epsilon
    b_plus['h'] = b['h']
    b_plus['1'] = b['1']
    b_plus['2'] = b['2']
    h_plus = h_init
    H, hidden, O = forward(X, h_epsilon, W_epsilon, b_epsilon, T) # Forward pass
    J_1 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_plus, W_plus, b_plus, T) # Forward pass
    J_2 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_init, W, b, T) # Forward pass
    W_grad, b_grad, h_grad = backward(X, y, h_init, H, hidden, O, W, b, T) # Backpropagation algorithm
    expected = np.sum(W_grad['2'])
    real = (J_2 - J_1)/(2*epsilon)
    print((real-expected)/(real+expected))
    
    W_epsilon = {}
    b_epsilon = {}
    b_plus = {}
    W_epsilon['h_h'] = W['h_h'] 
    W_epsilon['x_h'] = W['x_h']
    W_epsilon['1'] = W['1'] 
    W_epsilon['2'] = W['2']
    b_epsilon['h'] = b['h']
    b_epsilon['1'] = b['1']
    b_epsilon['2'] = b['2']
    h_epsilon = h_init - epsilon
    W_plus = {}
    W_plus['h_h'] = W['h_h'] 
    W_plus['x_h'] = W['x_h']
    W_plus['1'] = W['1'] 
    W_plus['2'] = W['2']
    b_plus['h'] = b['h']
    b_plus['1'] = b['1']
    b_plus['2'] = b['2']
    h_plus = h_init + epsilon
    H, hidden, O = forward(X, h_epsilon, W_epsilon, b_epsilon, T) # Forward pass
    J_1 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_plus, W_plus, b_plus, T) # Forward pass
    J_2 = calculate_errors(O, y) # Loss calculation
    H, hidden, O = forward(X, h_init, W, b, T) # Forward pass
    W_grad, b_grad, h_grad = backward(X, y, h_init, H, hidden, O, W, b, T) # Backpropagation algorithm
    expected = np.sum(h_grad)
    real = (J_2 - J_1)/(2*epsilon)
    print((real-expected)/(real+expected), real, expected)
    

In [96]:
EPOCH_NUM = 50
BATCH_SIZE = 32
LR = 0.006
hid_state_size = 128
hid_layer_size = 64
out_size = 6
alpha = 0.85
T = 150
check2(EPOCH_NUM, BATCH_SIZE, LR, alpha, hid_state_size, hid_layer_size, out_size, T, X_train, X_val, y_train, y_val)

-1.0
-6.601731803673805e-05 -1.2002621119222567e-05 -1.2004205985562597e-05
