In [None]:
import os
curr_dir = os.getcwd()

## Filepath

In [None]:
def _project_path(*parts):
    """Return ``curr_dir`` joined with ``parts``, with every backslash turned into a forward slash."""
    return os.path.join(curr_dir, *parts).replace('\\', '/')


# Every file read by this notebook sits in a sub-folder of the working directory.
word_vectors_filepath = _project_path('word_vector', 'word_vector.txt')
movie_queries_test_text_filepath = _project_path('test_set', 'movie_queries_test_text.txt')
index_to_target_filepath = _project_path('index_converter', 'index_to_target.txt')
target_to_index_filepath = _project_path('index_converter', 'target_to_index.txt')
best_weights_filepath = _project_path('model_training_weights', 'weights.222.hdf5')
best_hyperparams_info_filepath = _project_path('random_search_data', 'best_hyperparameter_info.txt')

## Imports

In [None]:
import requests
import pickle
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.data import load
import numpy as np
import pandas as pd
import string
import re 
from keras import backend as k
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, LSTM, Input, concatenate, TimeDistributed, Bidirectional, Masking
from keras_contrib.layers import CRF
from keras_contrib.metrics import crf_viterbi_accuracy, crf_accuracy
from keras_contrib.losses import crf_loss
from keras.optimizers import Adam  
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from keras.preprocessing.text import text_to_word_sequence
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split, ParameterGrid, ParameterSampler, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from keras.wrappers.scikit_learn import KerasClassifier
import tensorflow as tf

### You only need to run the cell below once. After the resources have been downloaded you can delete it, here and in the other notebooks.

In [None]:
# Tagset help files: read further down by load('help/tagsets/upenn_tagset.pickle')
nltk.download('tagsets')
# Punkt models: presumably what sent_tokenize / word_tokenize rely on (confirm against the NLTK docs)
nltk.download('punkt')
# Tagger model: presumably what nltk.pos_tag relies on (confirm against the NLTK docs)
nltk.download('averaged_perceptron_tagger')

## Getting Test Text

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(movie_queries_test_text_filepath, "rb") as test_text_file:
    test_text = pickle.load(test_text_file)

## Configurations

* Add the functions needed for cleaning the raw text
* Set a dictionary for the index to be converted back into the target name once prediction has been done
* Initialize the input sequence

In [None]:
#Functions to remove . from numbers found using regex
def remove_decimal(number):
    """Return the text matched by ``number`` (a regex match object) with every '.' removed.

    Meant to be passed as the replacement callback of ``re.sub``.
    """
    matched_text = number.group(0)
    return matched_text.replace('.', '')

#Functions to remove , from numbers found using regex
def remove_comma(number):
    """Return the text matched by ``number`` (a regex match object) with every ',' removed.

    Meant to be passed as the replacement callback of ``re.sub``.
    """
    matched_text = number.group(0)
    return matched_text.replace(',', '')

def base_model(units=50, optimizer='Adam', hidden_layers=2, activation_td='relu', dropout=0.1, recurrent_dropout=0.1):
    """Build and compile the stacked bidirectional-LSTM + CRF sequence tagger.

    Layers are created in this order: Input(shape=(80, 95)) -> Masking(mask_value=0.) ->
    ``hidden_layers`` x Bidirectional(LSTM) -> TimeDistributed(Dense(50)) -> CRF(25).

    Args:
        units: Number of units of every LSTM layer.
        optimizer: Optimizer handed to ``compile``.
        hidden_layers: Number of stacked bidirectional LSTM layers; must be at least 1.
        activation_td: Activation of the time-distributed dense layer.
        dropout: ``dropout`` argument of every LSTM layer.
        recurrent_dropout: ``recurrent_dropout`` argument of every LSTM layer.

    Returns:
        The compiled model (loss ``crf_loss``, metric ``crf_accuracy``).

    Raises:
        ValueError: If ``hidden_layers`` is smaller than 1.
    """
    # Fail early with a clear message; without a recurrent layer there is nothing to feed the dense layer.
    if hidden_layers < 1:
        raise ValueError('hidden_layers must be at least 1, got {}'.format(hidden_layers))

    # Named model_input so that the built-in input() is not shadowed.
    model_input = Input(shape=(80,95))
    sequence = Masking(mask_value=0.)(model_input)
    # Each bidirectional layer consumes the output of the previous one (the first consumes the mask output).
    for _ in range(hidden_layers):
        sequence = Bidirectional(LSTM(units=units, return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout))(sequence)
    model_last_layer = TimeDistributed(Dense(50, activation=activation_td))(sequence)
    out = CRF(25)(model_last_layer)
    model_final = Model(model_input, out)
    model_final.compile(optimizer=optimizer, loss=crf_loss, metrics=[crf_accuracy])
    return model_final

#The following combination of hyperparameters was found to work quite well, which is why it is hard-coded here. If the
#random search result is to be used instead, comment out the next line and uncomment the 2 lines following it.
#Layout: element 0 is a placeholder ('dummy'); element 1 is the hyperparameter dict read below.
#Only the units, optimizer, hidden_layers, dropout and recurrent_dropout entries are read in this cell.
best_hyperparameter_info = ['dummy',{'units_hyperparams': 100, 'recurrent_dropout_hyperparams': 0.3, 'optimizer_hyperparams': 'Adadelta', 'hidden_layers_hyperparams': 1, 'epochs_hyperparams': 250, 'dropout_hyperparams': 0.2, 'batch_size_hyperparams': 32}]
#with open(best_hyperparams_info_filepath, "rb") as t:
    #best_hyperparameter_info = pickle.load(t)

#GPU Options are added to prevent this file from taking up all the GPU. You can remove it if this is the only file you are running.
#The model is built inside its own graph (graph1) and session (session1); the prediction cells further down
#call model1.predict inside `with session1.as_default()`.
#NOTE: tf.ConfigProto and tf.Session are TensorFlow 1.x APIs.
graph1 = tf.Graph()
with graph1.as_default():
    config = tf.ConfigProto()
    #Let the GPU allocation grow on demand (see the GPU note above)
    config.gpu_options.allow_growth = True
    #Device placement logging is switched on
    config.log_device_placement = True
    session1 = tf.Session(config=config)
    with session1.as_default():
        #Rebuild the architecture with the chosen hyperparameters, then load the trained weights into it
        model1 = base_model(units=best_hyperparameter_info[1]['units_hyperparams'],optimizer=best_hyperparameter_info[1]['optimizer_hyperparams'],hidden_layers=best_hyperparameter_info[1]['hidden_layers_hyperparams'],dropout=best_hyperparameter_info[1]['dropout_hyperparams'],recurrent_dropout=best_hyperparameter_info[1]['recurrent_dropout_hyperparams'])
        model1.load_weights(best_weights_filepath)
        #Private Keras call; presumably builds the predict function up front inside this graph/session - TODO confirm
        model1._make_predict_function()

In [None]:
#Dictionary to convert a predicted index back into its target tag
#NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(index_to_target_filepath, "rb") as index_to_target_file:
    index_to_targets = pickle.load(index_to_target_file)

In [None]:
index_to_targets

In [None]:
#Initialize input_sequence
input_sequence = []

In [None]:
#Load all possible pos tags and derive the matching one-hot column names ('pos_' + tag)
tagdict = load('help/tagsets/upenn_tagset.pickle')
all_pos = list(tagdict.keys())
all_pos_tags = ['pos_' + tag for tag in all_pos]

In [None]:
all_pos

## Input text to be extracted

Here, I am using the test text provided by the MIT website which I have scraped

In [None]:
text = test_text[5]

text = "Input a movie search query"

## Find Numbers

Decimal points from numbers will be removed during text cleaning as these decimal points could be mistaken as fullstops by the sentence tokenizer which would result in inaccurate sentence tokenization.

The numbers list acts as a reference to replace the unformatted decimal numbers with their original format.

In [None]:
text

In [None]:
#Find numbers
#Strip everything except letters, digits, '.' and whitespace (so ',' separators disappear),
#then collect every number, keeping its decimal point.
#Raw strings are used so that \d, \s and \. reach the regex engine without being treated as
#(invalid) string escape sequences.
text_find_numbers = re.sub(r'[^a-zA-Z0-9.\s]+', '', text)
numbers = re.findall(r'\d*\.?\d+', text_find_numbers)

In [None]:
numbers

In [None]:
#Text cleaning
#Characters that are not letters, numbers, '.', ',' or whitespace are removed as they would impact
#sentence tokenization, word tokenization and word vectorization.
#Raw strings are used so that \d, \s, \. and \, reach the regex engine without being treated as
#(invalid) string escape sequences.
text = " ".join(text.splitlines())
text = re.sub(r'[^a-zA-Z0-9.,\s]+', '', text)
#Collapse every run of whitespace into a single space
text = re.sub(r'\s+', ' ', text).strip()
#Drop the '.' and then the ',' found inside numbers
text = re.sub(r'\d*\.?\d+', remove_decimal, text)
text = re.sub(r'\d*\,?\d+', remove_comma, text)

In [None]:
text

In [None]:
#Split the text into sentences, then tokenize and pos-tag the words of each sentence.
#pos_dict maps the 1-based sentence number to the tagging result of that sentence.
sentences = sent_tokenize(text)
pos_dict = {
    sentence_no: nltk.pos_tag(word_tokenize(sentence_text))
    for sentence_no, sentence_text in enumerate(sentences, start=1)
}

In [None]:
pos_dict

In [None]:
#Strip every character that is not a letter or digit from each word; words that end up empty are dropped
for sentence_no, tagged_words in pos_dict.items():
    kept = []
    for original_word, tag in tagged_words:
        stripped_word = re.sub('[^a-zA-Z0-9]+', '', original_word)

        #Report every word that lost at least one character
        if len(stripped_word) != len(original_word):
            print('Found stray punctuation')
            print('Original: {}'.format(original_word))
            print('New: {}'.format(stripped_word))
            print('')
            print('#################')

        #Only words with something left keep their (word, tag) entry
        if stripped_word:
            kept.append((stripped_word, tag))

    #Re-assigning an existing key is safe while iterating over items()
    pos_dict[sentence_no] = kept

In [None]:
pos_dict

In [None]:
#Create dataframe with one row per word: sentence number, word, and part of speech
rows = []
for sentence_no, tagged_words in pos_dict.items():
    for word, pos_tag in tagged_words:
        rows.append((sentence_no, word, pos_tag))
df = pd.DataFrame(rows, columns=['sentence_no','word','pos'])

#Untouched copy, used after prediction to look the words up again
df_for_prediction = df.copy()

In [None]:
df

In [None]:
#Get list of words from dataframe
tokenized_text = df['word'].tolist()

In [None]:
tokenized_text

In [None]:
#Word Vectorization API
#The reply body is not read here; the vectors are loaded from word_vectors_filepath in the next cell
#(presumably the service writes that file - confirm against the API implementation).
word_vector_api_data = tokenized_text
session = requests.Session()
#Do not pick up proxy settings from the environment for this local call
session.trust_env = False
response = session.post('http://127.0.0.1:5000/word_vectorization', json = word_vector_api_data) #add proxies args if needed
#Stop on an HTTP error instead of silently continuing with vectors left over from an earlier run
response.raise_for_status()
response

In [None]:
#NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(word_vectors_filepath, "rb") as word_vectors_file:
    word_vectors = pickle.load(word_vectors_file)

In [None]:
#Add word features to dataframe: attach each word's vector, then one-hot encode the pos column
df = pd.get_dummies(df.assign(word_vec=word_vectors), columns=['pos'])

In [None]:
#Pos tags that do not occur in this text still need their own zero-filled column
add_pos_col = [tag_col for tag_col in all_pos_tags if tag_col not in df.columns]
for tag_col in add_pos_col:
    df[tag_col] = 0

#Rearrange columns in a fixed order for consistency with the training data set
arrange_df_cols = ['sentence_no','word','word_vec'] + all_pos_tags
df = df.reindex(columns=arrange_df_cols)

In [None]:
df

In [None]:
#Get the sentence feature vectors. Each sentence feature vector contains a list of all its word feature vectors.
#errors='ignore' keeps this cell re-runnable after the 'word' column has already been dropped
df = df.drop(columns=['word'], errors='ignore')
sentence_feature_vectors = {}
for _, row in df.iterrows():
    #Explicit positional access (.iloc): integer keys such as row[0] on a labelled row are deprecated in recent pandas
    sentence_number = row.iloc[0]
    #axis=None flattens the word vector and the pos indicator values into a single 1-D vector
    word_feature_vector = np.concatenate(row.iloc[1:].tolist(), axis=None)
    sentence_feature_vectors.setdefault(sentence_number, []).append(word_feature_vector)

In [None]:
sentence_feature_vectors

In [None]:
#Pad every sentence to the fixed model input length and collect the result in input_sequence
max_sentence_length = 80  #has to match the first dimension of Input(shape=(80,95)) in base_model

if not sentence_feature_vectors:
    raise ValueError('No words are left after text cleaning, so there is nothing to predict on')

#Start from an empty list so that re-running this cell does not append the same sentences a second time
input_sequence = []

#Take the vector width from the first stored sentence rather than assuming sentence number 1 exists
dummy_length = len(next(iter(sentence_feature_vectors.values()))[0])

for sentence_number, sentence in sentence_feature_vectors.items():
    #Fail with a clear message here instead of producing a ragged array that only breaks at predict time
    if len(sentence) > max_sentence_length:
        raise ValueError('Sentence {} has {} words but the model accepts at most {}'.format(sentence_number, len(sentence), max_sentence_length))

    #All-zero word vectors are used as padding
    padding = [np.zeros(dummy_length, dtype=int) for _ in range(max_sentence_length - len(sentence))]
    input_sequence.append(np.array(sentence + padding))

x = np.array(input_sequence)

In [None]:
x

In [None]:
x.shape

## Prediction for all entities 

`np.argmax` is used to get the index of the tag with the highest probability in the arrays

In [None]:
#WITHOUT np.argmax
#Run the prediction under session1, the session the model was set up in
with session1.as_default():
    result_dummy = model1.predict(x)

print('Without np argmax the output sequence is a 3D output where each word contains one hot encodings: {}'.format(result_dummy.shape))
#Bare last expression so that the notebook displays the raw output
result_dummy

In [None]:
#Predict for x and reduce the last axis with np.argmax, leaving one tag index per word position
with session1.as_default():
    model_output = model1.predict(x)
    prediction = np.argmax(model_output, axis=-1)

print('With np argmax the output sequence is a 2D output where each word is an index: {}'.format(prediction.shape))
prediction

In [None]:
session1.close()

#### Convert the index to its respective target

In [None]:
#Convert the index of the prediction to its respective target tag
predicted_tag = [[index_to_targets[i] for i in row] for row in prediction]

In [None]:
predicted_tag

<br>

#### Now that we have the predicted tags we need to get their respective words by getting the word in the same position as the target in its array

#### Create a list of lists of the words that has the same shape as predicted_tag list of lists

#### Create a dictionary where each key is a sentence number and each value is the words of that sentence
<br>
<br>

In [None]:
#Create a dictionary with sentence number as keys and the values as the words of the sentence
sentences = {}
#The two columns are read by name: integer keys such as row[0] / row[1] on a labelled row are deprecated in recent pandas
for sentence_number, word in zip(df_for_prediction['sentence_no'], df_for_prediction['word']):
    sentences.setdefault(sentence_number, []).append(word)

In [None]:
sentences

<br>

#### Add padding to each word list as the prediction contains padding too
<br>

In [None]:
#Pad every word list (in place) with the literal 'padding' up to 80 entries
word_sequence = []
for sentence_words in sentences.values():
    missing = 80 - len(sentence_words)
    if missing > 0:
        sentence_words.extend(['padding'] * missing)
    word_sequence.append(sentence_words)

In [None]:
word_sequence

<br>

#### Adding formatting back to numbers that are supposed to have decimal points

However, some numbers may still be attached to letters, such as '4th', as recognized by the word tokenizer, and so we should not separate them. The same goes for numbers such as '200+'.
<br>
<br>

```python
    re.findall('\d*\.?\d+',word)
``` 
searches for a number in the current word 

```python
    re.search('[a-zA-Z+]', word)
```
searches for a letter or a plus sign in the word.

If both text and numbers are present we do not replace anything
<br>
<br>

In [None]:
#Outer loop loops over sentences
#Inner loop loops over words in the sentence

#Add decimal points back to numbers that have them.
#counter is the position in numbers of the next number waiting to be matched; it only moves
#forward, so the numbers are consumed in the order in which they were found in the text.
counter = 0
for sentence in word_sequence:
    for curr_index, word in enumerate(sentence):

        #Nothing left to restore once all the formatted numbers have been consumed
        if counter >= len(numbers):
            break

        #Only act when the word holds exactly one number and it equals the awaited number
        #with its decimal point removed (raw string: \d and \. are regex escapes, not string escapes)
        if re.findall(r'\d*\.?\d+', word) != [numbers[counter].replace('.','')]:
            continue

        #A letter or a plus in the word (e.g. '4th', '200+') means the word is left untouched;
        #otherwise the word is replaced with the correctly formatted number
        if not re.search('[a-zA-Z+]', word):
            sentence[curr_index] = numbers[counter]

        #Either way this number has been dealt with, so move on to the next one
        counter += 1

In [None]:
word_sequence

#### Initialize an empty dictionary to store the results

In [None]:
#Dictionary to convert categories to index
#NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(target_to_index_filepath, "rb") as target_to_index_file:
    old_result_dict = pickle.load(target_to_index_file)

In [None]:
old_result_dict

In [None]:
#Keep every target as a key, but give each one its own empty list to collect words in.
#A comprehension is used so that no loop variable overwrites the module-level name k (the keras backend alias).
old_result_dict = {target: [] for target in old_result_dict}

In [None]:
#Initialize result dictionary
old_result_dict

### Getting the respective words of the targets and storing the results

The outer loop iterates over the sentences; the inner loop iterates over the predictions within each sentence.

Getting the corresponding word of the target now that both arrays have the exact same shape

In [None]:
#Outer loop loops through sentences, Inner loop loops through the sentence

#enumerate supplies the sentence and word positions used to index into word_sequence
for sent_counter, sentence_prediction in enumerate(predicted_tag):
    for word_counter, single_prediction in enumerate(sentence_prediction):

        #Nothing is recorded for positions tagged O
        if single_prediction == 'O':
            continue

        print('Predicted tag found: {}'.format(single_prediction))
        print('Its position in the word sequence is')
        print('    Sentence Number: {} , Index: {}'.format(sent_counter,word_counter))

        #Store the word found at the same position as the predicted tag under that tag
        old_result_dict[single_prediction].append(word_sequence[sent_counter][word_counter])

        print('Added word: {} from word sequence using subsets {}, {}'.format(word_sequence[sent_counter][word_counter],sent_counter,word_counter))
        print(' ')

In [None]:
old_result_dict

As each label has B-, I- tags, we need to consolidate the variations of the labels to its root label. i.e B-ACTOR and I-ACTOR should be considered as ACTOR.

We iterate through the dictionary and extract out the root label, and add the value

In [None]:
#Merge the B- / I- variants of every label into their root label
result_dict = {}
for tag, words in old_result_dict.items():
    #Split on the first '-' only, so that a root label may itself contain '-';
    #tags without a prefix (such as 'O') are kept unchanged
    new_key = tag.split('-', 1)[1] if '-' in tag else tag
    #Extend a list owned by result_dict. Storing the list object taken from old_result_dict would let
    #the extend modify old_result_dict as well, which duplicates words when this cell is run again
    result_dict.setdefault(new_key, []).extend(words)

In [None]:
result_dict

We now join all the values in the arrays to make it cleaner

In [None]:
#One space-separated string per label. A comprehension keeps its loop variables local,
#so the module-level name k (the keras backend alias) is not overwritten.
result_dict_clean = {label: " ".join(words) for label, words in result_dict.items()}

In [None]:
result_dict_clean

In [None]:
#Build the frame with the labels as index, then transpose so that the labels become the columns
result_df = pd.DataFrame.from_dict(result_dict_clean, orient='index').transpose()

In [None]:
result_df = result_df.replace(to_replace=[None], value='')

In [None]:
result_df