# Unit tests
Since the expressive power of a neural network should be at least comparable to that of our wonderful, simple bi-gram model, it was suggested that we run tests to see where their performance diverges.

Essentially:
* Generate some test data for which we can reason about what the accuracy SHOULD be
* Test the bigram and the network
* See if things are as expected 
* If no, find out why
* If yes, try to come up with a new test
* Since these are meant to be just quick checks, no need to cross validate.

## Imports and functions/models to test

In [1]:
#Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import seaborn as sns
sns.set()
sns.set_style("whitegrid")

from tensorflow.keras import *
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *

from chord_functions import *

  from ._conv import register_converters as _register_converters


In [2]:
def numerator(test):
    '''
    Count, for every context in test and every known chord, how often
    the context immediately followed by that chord occurs in the training n-grams.

    test: 2-D array with one context sequence per row.
    Reads the notebook-level names `chords` (candidate chords) and `ngram`
    (training n-grams), and delegates the counting to `denominator`.

    Returns a float matrix of shape (test.shape[0], len(chords)); entry [r, c]
    is the number of rows of `ngram` equal to test[r] extended with chords[c].

    '''
    n_samples = test.shape[0]
    counts = np.zeros((n_samples, len(chords)))
    for col, candidate in enumerate(chords):
        # Column vector holding the same candidate chord for every sample
        candidate_col = np.repeat(candidate, n_samples).reshape(n_samples, 1)
        # Append the candidate to each context to form a full n-gram
        extended = np.hstack([test, candidate_col])
        counts[:, col] = denominator(ngram, extended)
    return counts
        


In [3]:
def denominator(train,test):
    '''
    Count how many rows of train are identical to each row of test.

    train, test: 2-D arrays with one sequence per row.
    A training row matches when the number of element-wise equal positions
    equals the number of columns in test.

    Returns a 1-D float array of length test.shape[0] with the match counts.

    '''
    n_samples, width = test.shape[0], test.shape[1]
    counts = [
        # Per training row: number of matching positions; a full match equals width
        np.sum(np.sum(train == test[row], axis=1) == width)
        for row in range(n_samples)
    ]
    return np.array(counts, dtype=float)

In [4]:
def lstm(lstm_x, lstm_y):
    '''
    Build and compile a stacked two-layer LSTM classifier.

    lstm_x: input array; lstm_x.shape[1] and lstm_x.shape[2] define the
            per-sample input shape (presumably timesteps x features - confirm
            against generate_sequences).
    lstm_y: target array; lstm_y.shape[1] sets the width of the softmax output.

    Returns the compiled, untrained Sequential model.

    '''
    input_shape = (lstm_x.shape[1], lstm_x.shape[2])
    n_outputs = lstm_y.shape[1]

    model = Sequential([
        # First LSTM returns the full sequence so the second LSTM can consume it
        LSTM(256, return_sequences=True, input_shape=input_shape),
        Dropout(0.3),
        # Second LSTM emits only its final output
        LSTM(64, return_sequences=False),
        Dropout(0.3),
        Dense(n_outputs, activation='softmax'),
    ])

    model.compile(loss='categorical_crossentropy',
                  optimizer='Adam',
                  metrics=['accuracy'])
    return model

In [5]:
# Seed NumPy's global random generator so NumPy-based randomness is repeatable
seed = 1
np.random.seed(seed)

# Read the chord corpus and keep only the 'chord' column (as a one-column DataFrame)
DF = pd.read_csv('data/820chords.csv').loc[:, ['chord']]

# Cases

### Case 1: All chord sequences are the same
Accuracy should be 100% for both models.

In [6]:
#Generate baseline input: the first two rows of DF repeated 50 times
#(row 0, row 1, row 0, row 1, ...), i.e. 100 rows in total.
#A single positional selection replaces growing the frame with DataFrame.append
#in a loop: append copied the whole frame on every call and was removed in
#pandas 2.0. The selected rows keep their original index labels, exactly as the
#appended rows did.
data = DF.iloc[[0, 1] * 50]

#Define split position (first 70% of the rows are used for training)
split = round(len(data)*0.7)

#### NN

In [7]:
#Format data for LSTM
n = 4  # sequence length passed to generate_sequences

# One-hot encode the chords; the same encoding is passed as both arguments
dummies = pd.get_dummies(data)
lstm_x, lstm_y = generate_sequences(dummies, dummies, n)

# Everything before `split` is used for training, the remainder for validation
lstm_x_train, lstm_x_valid = lstm_x[:split], lstm_x[split:]
lstm_y_train, lstm_y_valid = lstm_y[:split], lstm_y[split:]

In [8]:
#Train NN
model = lstm(lstm_x_train, lstm_y_train)

# fit() returns the History object that is also stored on model.history
history = model.fit(
    lstm_x_train,
    lstm_y_train,
    epochs=10,
    validation_data=(lstm_x_valid, lstm_y_valid),
    verbose=0,
)

# One row per epoch, one column per recorded metric
NN_result = pd.DataFrame(history.history)

#### NGRAM

In [9]:
#Format data for ngram
def _make_ngrams(corpus, n):
    '''
    Slide a window of length n over the 1-D sequence corpus.

    Returns a 2-D array with one n-gram per row and n columns. There are
    len(corpus) - n + 1 rows, so the last elements of the corpus are included;
    the result has zero rows (but still n columns) when the corpus is shorter
    than n, which keeps the column slicing below valid.

    '''
    corpus = np.asarray(corpus)
    n_windows = max(len(corpus) - n + 1, 0)
    windows = [corpus[start:start + n] for start in range(n_windows)]
    return np.array(windows, dtype=corpus.dtype).reshape(n_windows, n)


train = data[:split]
valid = data[split:]

n = 2
chords=data.chord.unique()

#N gram generation: training
#(the previous bound, len(corpus) - (n + 1), silently dropped the last two windows)
corpus=train.chord.values
ngram=_make_ngrams(corpus, n)
train_x=ngram[:,:n-1]   # context: all but the last chord of each n-gram
train_y=ngram[:,-1]     # target: the last chord of each n-gram

#N gram generation: validation
corpus_val=valid.chord.values
ngram_val=_make_ngrams(corpus_val, n)
val_x=ngram_val[:,:n-1]
val_y=ngram_val[:,-1]

In [10]:
#Train NGRAM

# Add-one smoothed score for every (validation context, candidate chord) pair:
#   (count(context followed by chord) + 1) / (count(context) + number of chords)
a=numerator(val_x)+1
b=denominator(train_x,val_x)+len(chords)

# a is (samples, chords) and b is (samples,); transposing a lets b broadcast
# across the chord axis, and argmax over axis 0 picks the best chord per sample.
max_ind=np.argmax(a.T/b.T,axis=0)

# Look the predicted chords up directly. The previous fixed-width "<U10" buffer
# silently truncated chord labels longer than 10 characters, which would then
# never compare equal to the true label. pred keeps its (samples, 1) shape.
pred=np.asarray(chords)[max_ind].reshape(-1,1)

# Fraction of validation samples whose predicted chord equals the true next chord
result = np.sum(pred.flatten()==val_y)/len(pred)

In [11]:
print("Comparison of validation accuracy")
# The validation-accuracy column is named 'val_acc' by older Keras releases and
# 'val_accuracy' by releases that report metrics under the user-supplied name;
# use whichever one is present so the cell works with either.
val_acc_col = "val_accuracy" if "val_accuracy" in NN_result.columns else "val_acc"
# Last row = accuracy after the final epoch
print("Neural network: {}".format(NN_result[val_acc_col].iloc[-1]))
print("N-gram        : {}".format(result))

Comparison of validation accuracy
Neural network: 1.0
N-gram        : 1.0


### Case 2: