In [None]:
import os
#Working directory of the kernel; every data file path in this notebook is built relative to it
curr_dir = os.getcwd()

## Filepath

In [None]:
def _data_path(folder, filename):
    """Return the forward-slash path of `filename` inside `folder` under curr_dir."""
    return os.path.join(curr_dir, folder, filename).replace('\\', '/')


#Inputs produced by the earlier preprocessing steps
training_sets_filepath = _data_path('training_set', 'movie_queries_training_dataset.csv')
word_vectors_filepath = _data_path('word_vector', 'word_vector.txt')
target_to_index_filepath = _data_path('index_converter', 'target_to_index.txt')

#Files written and read by the random search itself
random_search_hist_filepath = _data_path('random_search_data', 'random_search_hist.txt')
random_search_hyperparams_filepath = _data_path('random_search_data', 'random_search_hyperparams.txt')
best_hyperparams_info_filepath = _data_path('random_search_data', 'best_hyperparameter_info.txt')

## Imports

In [None]:
import requests
import pickle
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.data import load
import numpy as np
import pandas as pd
import string
import re 
from keras import backend as k
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, LSTM, Input, concatenate, TimeDistributed, Bidirectional, Masking
from keras_contrib.layers import CRF
from keras_contrib.metrics import crf_viterbi_accuracy, crf_accuracy
from keras_contrib.losses import crf_loss
from keras.optimizers import Adam  
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from keras.preprocessing.text import text_to_word_sequence
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split, ParameterGrid, ParameterSampler, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from keras.wrappers.scikit_learn import KerasClassifier

### You only need to run the cell below once. After that you can delete it, both here and in the other notebooks

In [None]:
#Fetch the nltk resources into the local nltk data directory.
#'tagsets' provides the help/tagsets/upenn_tagset.pickle file loaded further down;
#'punkt' and 'averaged_perceptron_tagger' are presumably needed by the tokenizer / POS tagger
#used elsewhere in the project - they are not referenced directly in this notebook.
nltk.download('tagsets')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

## Configurations

In [None]:
#Mapping from each target category to its integer index.
#NOTE(review): pickle.load can execute arbitrary code - only load files produced by this project.
with open(target_to_index_filepath, "rb") as index_file:
    target_to_index = pickle.load(index_file)

#Label indices scored by the f1 metric: every index of the mapping except the last one
#(presumably the 'O' tag - TODO confirm against the pickled dictionary)
f1_labels = list(target_to_index.values())
f1_labels.pop()

#input_sequence collects the padded sentences, output_sequence the padded targets of those sentences
input_sequence, output_sequence = [], []

#Random search results, keyed by hyperparameter number
random_search_hist = dict()

def base_model(units=50, optimizer='Adam', hidden_layers=2, activation_td ='relu', dropout=0.1, recurrent_dropout=0.1,
               num_tags=25, dense_units=50, input_shape=None):
    """Build and compile the stacked bidirectional-LSTM + CRF sequence tagger.

    Parameters
    ----------
    units : int
        Number of units of every LSTM layer.
    optimizer : str or keras optimizer
        Optimizer handed to ``compile``.
    hidden_layers : int
        Number of stacked bidirectional LSTM layers. With 0 the masked input is fed
        straight into the time-distributed dense layer.
    activation_td : str
        Activation of the time-distributed dense layer.
    dropout, recurrent_dropout : float
        Input and recurrent dropout of every LSTM layer.
    num_tags : int
        Number of output tags of the CRF layer (25 by default, matching the
        ``num_classes`` used when the targets are one-hot encoded).
    dense_units : int
        Width of the time-distributed dense layer.
    input_shape : tuple or None
        ``(timesteps, features)`` of one sample. When None it is read from the
        notebook-level training array ``x``.

    Returns
    -------
    keras.models.Model
        The compiled model (CRF loss, CRF accuracy metric).
    """
    if input_shape is None:
        input_shape = (x.shape[1], x.shape[-1])

    #k.clear_session() is necessary here to save memory
    k.clear_session()

    model_input = Input(shape=input_shape)
    #All-zero timesteps are padding and are masked out for the layers downstream
    layer_output = Masking(mask_value=0.)(model_input)
    #Chain the recurrent layers: each one consumes the output of the previous one
    for _ in range(hidden_layers):
        layer_output = Bidirectional(LSTM(units=units, return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout))(layer_output)
    layer_output = TimeDistributed(Dense(dense_units, activation=activation_td))(layer_output)
    out = CRF(num_tags)(layer_output)

    model_final = Model(model_input, out)
    model_final.compile(optimizer=optimizer, loss=crf_loss, metrics=[crf_accuracy])
    return model_final

#Initialize the hyperparameter number for random cv.
#It is the key under which the result of the next evaluated combination is stored in
#random_search_hist, and it is incremented after every combination.
hyperparam_number = 0

In [None]:
#Every part-of-speech tag listed in nltk's upenn tagset
tagdict = load('help/tagsets/upenn_tagset.pickle')
all_pos = list(tagdict)

#The same tags carrying the 'pos_' prefix of the one-hot columns created by pd.get_dummies
all_pos_tags = ['pos_' + tag for tag in all_pos]

## Prepare training data 

In [None]:
#Read pre-processed dataset for training
df = pd.read_csv(training_sets_filepath)
#Untouched copy of the raw rows; the targets are extracted from it further down
df_target = df.copy()

#Get list of words from dataframe
tokenized_text = df['word'].tolist()

#Send all words to the local word vectorization service.
#Presumably the service writes the resulting vectors to word_vectors_filepath, which is read
#right after - verify against the service implementation.
word_vector_api_data = tokenized_text
with requests.Session() as session:
    #Ignore proxy settings coming from the environment for this local call
    session.trust_env = False
    response = session.post('http://127.0.0.1:5000/word_vectorization', json = word_vector_api_data) #add proxies args if needed
    #Fail loudly instead of silently loading stale vectors from a previous run
    response.raise_for_status()

#NOTE(review): pickle.load can execute arbitrary code - only load files produced by this project.
with open(word_vectors_filepath, "rb") as t:
    word_vectors = pickle.load(t)

#Add word features to dataframe: the word vector plus one one-hot column per POS tag
df['word_vec'] = word_vectors
df = pd.get_dummies(df, columns=['pos'])

#Guarantee that every known POS tag has a one-hot column, even if the tag never occurs
#in this dataset, so that the feature layout is the same on every run
df_cols = df.columns.tolist()
add_pos_col = [pos_tag for pos_tag in all_pos_tags if pos_tag not in df_cols]
for added_pos in add_pos_col:
    df[added_pos] = 0

#Fixed column order; reindex also drops every column that is not listed here
arrange_df_cols = ['sentence_no', 'word', 'word_vec', *all_pos_tags]
df = df.reindex(columns=arrange_df_cols)

#Every sentence is padded to this many words. Longer sentences are NOT truncated and would
#make the arrays built from input_sequence / output_sequence ragged.
max_sentence_length = 80

#Start from empty sequences so that re-running this cell does not duplicate the data
input_sequence.clear()
output_sequence.clear()

#Get the sentence feature vectors. Each sentence contains a list of all its word feature vectors.
df = df.drop(columns=['word'])
sentence_feature_vectors = {}
for index,row in df.iterrows():
    #Explicit positional access: column 0 is 'sentence_no', the remaining ones are the features
    sentence_number = row.iloc[0]
    #axis=None flattens the word vector and the POS flags into a single 1D feature vector
    word_feature_vector = np.concatenate(row.iloc[1:], axis = None)
    sentence_feature_vectors.setdefault(sentence_number, []).append(word_feature_vector)

#Pad length for sentences and append to the input_sequence
#The feature length is taken from the first stored word, whatever its sentence number is
dummy_length = len(next(iter(sentence_feature_vectors.values()))[0])
padding_vector = np.zeros(dummy_length, dtype=int)
for sentence in sentence_feature_vectors.values():
    #A non-positive range yields nothing, so full-length sentences are left unchanged
    sentence.extend(padding_vector for _ in range(max_sentence_length - len(sentence)))
    input_sequence.append(np.array(sentence))

#Add the target for each word of the sentence
#Assumes column 1 of the raw csv is the sentence number and the last column is the target
#label - TODO confirm against the csv layout
targets = {}
for index,row in df_target.iterrows():
    sentence_number = row.iloc[1]
    word_target = row.iloc[-1]
    targets.setdefault(sentence_number, []).append(word_target)

#Pad length for sentences and append to output_sequence
#Padding positions get the index of the 'O' target
padding_target = target_to_index['O']
for sentence in targets.values():
    sentence = [target_to_index[target] for target in sentence]
    sentence.extend([padding_target] * (max_sentence_length - len(sentence)))
    output_sequence.append(np.array(sentence))

In [None]:
#Stack the padded sentences into one input array
x = np.array(input_sequence)
#One-hot encode the padded target indices over the 25 classes
y = to_categorical(np.array(output_sequence), num_classes=25)
#Shuffle inputs and targets together; the fixed seed keeps the fold contents reproducible
x_s, y_s = shuffle(x, y, random_state=42)

## Continue random cv

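1. Load the saved list of hyperparameter combinations and the random search history
2. Remove all the combinations that have already been evaluated
3. Set the hyperparameter number so that the search continues from where it left off

Run the cells of this section only when resuming an interrupted search. When resuming, skip the cells under "Generating Hyperparameters" below, otherwise the saved list of hyperparameters is replaced by a new random sample.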

In [None]:
#Reload the full list of sampled hyperparameter combinations saved by a previous run
#NOTE(review): pickle.load can execute arbitrary code - only load files produced by this project.
with open(random_search_hyperparams_filepath, "rb") as hyperparams_file:
    hyperparams_list = pickle.load(hyperparams_file)

In [None]:
#Reload the results of the combinations evaluated so far
#NOTE(review): pickle.load can execute arbitrary code - only load files produced by this project.
with open(random_search_hist_filepath, "rb") as hist_file:
    random_search_hist = pickle.load(hist_file)

In [None]:
#Drop the combinations that already have an entry in the history; they are the first
#len(random_search_hist) items because the search walks the list in order.
#A single slice deletion also copes with a history that is longer than the list.
#NOTE: run this cell only once after loading - running it again removes further combinations.
del hyperparams_list[:len(random_search_hist)]

In [None]:
#Continue numbering after the combinations already stored in the history
hyperparam_number = len(random_search_hist)

## Random Search Cross Validation (CV)

Find the best combination of hyperparameters for the algorithm in order for it to best learn from our use case.

The random search draws random combinations of hyperparameters, which the model then tests one after another.

Cross validation is then used to evaluate the performance of the model. Cross validation splits the data into an arbitrary number of sets. For instance, it splits the data into 5 sets. On the first run it will be trained on set 1 to 4 and tested on 5. Next, it will be trained on set 2 to 5 and tested on 1, so on and so forth. 

This is effective in validating performance as it is tested on multiple sets of unseen data. If the model performs well during cross validation, chances are that it has learnt patterns that generalize well to our use case as it can predict unseen data well.

**Random Search vs Grid Search**

* Random Search is more feasible, as grid search would run over **every** combination of hyperparameters, i.e. 18,900 combinations for the value lists below (5 × 7 × 4 × 5 × 3 × 3 × 3)! This would take a very long time and is thus infeasible.
* With 60 random iterations there is a ~95% probability of finding at least one combination of hyperparameters within the top 5% best performing combinations ($1 - 0.95^{60} \approx 0.95$).

Libraries such as keras and sklearn do not provide random search cv for 3 dimensional inputs and outputs so custom random search cv code needs to be written.

1. Generate a random sample of 60 sets of hyperparameters using sklearn's ParameterSampler class

2. Manually split the data into 5 folds using subsetting 

3. Write code to train on 4 folds of the data and test on the remaining one, repeating this process 5 times so that every fold is the test set once while the other folds are the training data. The performance metric of the test set, the f1 score, is saved to a list at every iteration. At the end of the iterations, the evaluated set of hyperparameters, the average of the f1 scores and the list of all f1 scores are added to the dictionary

*To save time, the cross validation of a set of hyperparameters is stopped early if its f1 score on the first fold is below 0.7


### Selecting a measure of performance

F1 Score Vs Accuracy

Accuracy is a misleading indicator for imbalanced datasets. In our case, 'O' entities make up the majority of all entities.

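Example: out of 100 words, 9 are locations. The model labels 2 words as locations and only 1 of them is correct (1 true positive, 1 false positive, 8 false negatives, 90 true negatives).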

* Positive class = Location
* Negative class = Non Location


Accuracy: (Number of Correct Predictions) / (Total Number of Predictions), (1 + 90)/(100), 91% 

We aim to predict location entities, however out of the 9 location entities we only predicted 1 correctly. Thus accuracy creates the illusion that our model performs very well by taking the correct predictions of the majority negative class into account.

<br>
<br>

Precision: (Number of Correct Positive Predictions) / (Total Number of Positive Predictions), (1)/(1+1), 50%

Precision measures the proportion of positive predictions made that are correct.

<br>
<br>

Recall: (Number of Correct Positive Predictions) / (Total Number of Positives), (1)/(1+8), 11%

Recall measures the proportion of actual positives that were captured

<br>

F1 score is a combination of precision and recall, and is a much better reflection of a model's performance for predicting the positive class, which is the class we are aiming to predict

The f1 macro score is used as it calculates the f1 score for each label and averages them

In [None]:
#Candidate values of every hyperparameter the random search samples from
batch_size_hyperparams = [16, 32, 64, 128, 256] #batch_size passed to fit()
epochs_hyperparams = [30,50,80,100,150,200,250] #epochs passed to fit()
units_hyperparams = [50, 100, 150, 200] #units of every LSTM layer in base_model
optimizer_hyperparams = ['RMSprop', 'Adadelta', 'Adam', 'Adamax', 'Nadam'] #optimizer name passed to compile()
hidden_layers_hyperparams = [1, 2, 3] #number of stacked bidirectional LSTM layers
dropout_hyperparams = [0.1,0.2,0.3] #dropout argument of the LSTM layers
recurrent_dropout_hyperparams = [0.1,0.2,0.3] #recurrent_dropout argument of the LSTM layers

### Generating Hyperparameters

In [None]:
#Search space: hyperparameter name -> list of candidate values
hyperparams_grid = {
    'batch_size_hyperparams': batch_size_hyperparams,
    'epochs_hyperparams': epochs_hyperparams,
    'units_hyperparams': units_hyperparams,
    'optimizer_hyperparams': optimizer_hyperparams,
    'hidden_layers_hyperparams': hidden_layers_hyperparams,
    'dropout_hyperparams': dropout_hyperparams,
    'recurrent_dropout_hyperparams': recurrent_dropout_hyperparams,
}
print("Random set of hyperparameters to generate from:")
hyperparams_grid

In [None]:
#Draw 60 random combinations from the grid. The fixed random_state makes the sample
#reproducible, so a restarted kernel generates the same list again.
random_hyperparams = ParameterSampler(hyperparams_grid, n_iter=60, random_state=42)
hyperparams_list = list(random_hyperparams)

In [None]:
#Show the first sampled combination
hyperparams_list[0]

The following combination of parameters is found to work quite well, which is why I added it in. You can remove it if you want to start afresh.

In [None]:
#Displays True when the known-good combination is already part of the random sample
{'units_hyperparams': 100, 'recurrent_dropout_hyperparams': 0.3, 'optimizer_hyperparams': 'Adadelta', 'hidden_layers_hyperparams': 1, 'epochs_hyperparams': 250, 'dropout_hyperparams': 0.2, 'batch_size_hyperparams': 32} in hyperparams_list

In [None]:
#Combination that is known to work quite well
known_good_hyperparams = {'units_hyperparams': 100, 'recurrent_dropout_hyperparams': 0.3, 'optimizer_hyperparams': 'Adadelta', 'hidden_layers_hyperparams': 1, 'epochs_hyperparams': 250, 'dropout_hyperparams': 0.2, 'batch_size_hyperparams': 32}

#Put it first in the list, unless the random sample already contains it - overwriting
#index 0 in that case would evaluate the same combination twice and lose another one
if known_good_hyperparams not in hyperparams_list:
    hyperparams_list[0] = known_good_hyperparams

In [None]:
#Display every combination that is going to be evaluated
print("Random set of hyperparameters:")
hyperparams_list

In [None]:
#Persist the sampled combinations so that an interrupted search can be resumed with the same list
with open(random_search_hyperparams_filepath, "wb") as hyperparams_file:
    pickle.dump(hyperparams_list, hyperparams_file)

In [None]:
#Shape of the shuffled inputs; its first dimension is what gets split into folds below
x_s.shape

In [None]:
#Shape of the shuffled one-hot targets; the last dimension is the 25 classes set in to_categorical
y_s.shape

### Setting cross validation sets

In [None]:
#Split the shuffled inputs into 5 contiguous folds of equal size; the last fold also takes
#the remainder. For 9775 sentences this gives the fold size of 1955.
x_fold_size = len(x_s) // 5
x_1 = x_s[:x_fold_size]
x_2 = x_s[x_fold_size:2 * x_fold_size]
x_3 = x_s[2 * x_fold_size:3 * x_fold_size]
x_4 = x_s[3 * x_fold_size:4 * x_fold_size]
x_5 = x_s[4 * x_fold_size:]

In [None]:
#Display the shape of the first fold as a check of the split
print("Each fold contains approx 1955 sentences:")
x_1.shape

In [None]:
#Split the shuffled targets at the same positions as the inputs: 5 contiguous folds of
#equal size, the last one also taking the remainder (1955 per fold for 9775 sentences)
y_fold_size = len(y_s) // 5
y_1 = y_s[:y_fold_size]
y_2 = y_s[y_fold_size:2 * y_fold_size]
y_3 = y_s[2 * y_fold_size:3 * y_fold_size]
y_4 = y_s[3 * y_fold_size:4 * y_fold_size]
y_5 = y_s[4 * y_fold_size:]

In [None]:
#Folds in matching order: x_cv[i] and y_cv[i] hold the inputs and targets of the same sentences
x_cv = [x_1,x_2,x_3,x_4,x_5]
y_cv = [y_1,y_2,y_3,y_4,y_5]

In [None]:
#Number of cross validation folds
n_folds = len(x_cv)
#A combination whose macro f1 on the first fold is below this value is abandoned to save time
min_first_fold_f1 = 0.7

for hyperparam in hyperparams_list:
    #If this is the first hyperparameter in the random search, don't load the random search history file 
    #because there won't be a file yet
    if hyperparam_number!=0:
        with open(random_search_hist_filepath, "rb") as t:
            random_search_hist = pickle.load(t)

    #f1 score of every evaluated fold for the current hyperparameters
    temp_list = []
    for cv in range(n_folds):
        print('#############################')
        #Initializing the model with the hyperparameters to be evaluated
        my_model = base_model(units=hyperparam['units_hyperparams'], optimizer=hyperparam['optimizer_hyperparams'], hidden_layers = hyperparam['hidden_layers_hyperparams'], dropout = hyperparam['dropout_hyperparams'], recurrent_dropout = hyperparam['recurrent_dropout_hyperparams'])

        #Training data: every fold except the one at the current index, in fold order
        xtraining_set = np.concatenate([fold for i,fold in enumerate(x_cv) if i!=cv], axis=0)
        ytraining_set = np.concatenate([fold for i,fold in enumerate(y_cv) if i!=cv], axis=0)

        #Test data: the fold at the current index
        xtest_set = x_cv[cv]
        y_true = y_cv[cv]

        #Train the model on the training data
        my_model.fit(xtraining_set, ytraining_set, epochs=hyperparam['epochs_hyperparams'], batch_size=hyperparam['batch_size_hyperparams'])

        #Calculate f1
        #Get the prediction on the test set and flatten both y sets to 2D (words x classes),
        #as sklearn's f1 evaluation only accepts 2D inputs. Just all the words and
        #their corresponding targets, not split into sentences.
        y_pred = my_model.predict(xtest_set)
        y_true_reshaped = np.reshape(y_true, (-1, y_true.shape[-1]))
        y_pred_reshaped = np.reshape(y_pred, (-1, y_pred.shape[-1]))
        try:
            fold_f1 = f1_score(y_true_reshaped, y_pred_reshaped, average = 'macro', labels=f1_labels)
        except ValueError:
            #f1_score rejects predictions it cannot interpret as labels, e.g. NaN output of a
            #diverged model; such a fold counts as a score of 0
            print('Predicted NaN')
            print(' ')
            fold_f1 = 0
        else:
            print(hyperparam)
            print(' ')
            print(fold_f1)
            print(' ')
        #Exactly one score per fold
        temp_list.append(fold_f1)

        #If the score of the first fold is below the threshold go to the next hyperparameter
        if cv==0 and fold_f1<min_first_fold_f1:
            break

    #Average over the evaluated folds (a single fold when the evaluation was stopped early)
    calculated_f1 = np.average(temp_list)
    print('F1 score')
    print(hyperparam)
    print(calculated_f1)
    print(' ')
    random_search_hist[hyperparam_number] = [hyperparam, calculated_f1, temp_list]

    #Dump the random_search_hist after every hyperparameter so that progress survives an interruption
    with open(random_search_hist_filepath, "wb") as t:
        pickle.dump(random_search_hist, t)

    hyperparam_number+=1

### Find best hyperparameters

In [None]:
#Find the entry with the highest average f1 score.
#Every value of random_search_hist is [hyperparameters, average f1, f1 per fold].
#The result is [best average f1, its hyperparameters]; it stays [] when the history is empty.
best_hyperparameter_info = []
for key,value in random_search_hist.items():
    final_score = value[1]
    #The first entry initializes the best score, later ones must beat it
    if not best_hyperparameter_info or final_score > best_hyperparameter_info[0]:
        best_hyperparameter_info = [final_score, value[0]]

In [None]:
#Display the best average f1 score together with its hyperparameters
best_hyperparameter_info

In [None]:
#Store the best score and its hyperparameters in the random search data folder
with open(best_hyperparams_info_filepath, "wb") as best_file:
    pickle.dump(best_hyperparameter_info, best_file)