In [1]:
import os
# Working directory of the kernel; every data, model and result path in the
# Filepath section is built relative to it.
curr_dir = os.getcwd()

## Filepath

In [2]:
def _under_curr_dir(*parts):
    """Join `parts` onto curr_dir and normalise backslashes to forward slashes."""
    joined = os.path.join(curr_dir, *parts)
    return joined.replace('\\','/')

# Pickled word vectors that are read back after each word-vectorization request
word_vectors_filepath = _under_curr_dir('word_vector','word_vector.txt')
# Pickled list of test queries
movie_queries_test_text_filepath = _under_curr_dir('test_set','movie_queries_test_text.txt')
# Pickled lookup tables between class indices and tag names
index_to_target_filepath = _under_curr_dir('index_converter','index_to_target.txt')
target_to_index_filepath = _under_curr_dir('index_converter','target_to_index.txt')
# Trained model weights
best_weights_filepath = _under_curr_dir('model_training_weights','weights.best.hdf5')
# Template for the per-query CSV output; '{}' is filled with the result number
mass_predictor_results_filepath = _under_curr_dir('mass_predictor_results','predicted_result_{}.csv')

## Imports

In [3]:
import requests
import pickle
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.data import load
import numpy as np
import pandas as pd
import string
import re 
from keras import backend as k
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, LSTM, Input, concatenate, TimeDistributed, Bidirectional, Masking
from keras_contrib.layers import CRF
from keras_contrib.metrics import crf_viterbi_accuracy, crf_accuracy
from keras_contrib.losses import crf_loss
from keras.optimizers import Adam  
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from keras.preprocessing.text import text_to_word_sequence
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split, ParameterGrid, ParameterSampler, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from keras.wrappers.scikit_learn import KerasClassifier
import tensorflow as tf

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### You only need to run the cell below once. After that, you can delete it here and in all the other notebooks.

In [4]:
# One-time download of the NLTK resources used by this notebook.
# 'tagsets' supplies help/tagsets/upenn_tagset.pickle, which is loaded further down.
nltk.download('tagsets')
# presumably the model behind sent_tokenize / word_tokenize -- confirm against the NLTK docs
nltk.download('punkt')
# presumably the model behind nltk.pos_tag -- confirm against the NLTK docs
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Configurations

In [5]:
#Define function to create base model dynamically
def base_model(units=50, optimizer='Adam', hidden_layers=2, activation_td ='relu', dropout=0.1, recurrent_dropout=0.1):
    """Build and compile the bidirectional-LSTM + CRF tagging model.

    The network takes inputs of shape (80, 95), masks time steps whose value
    is 0., passes them through `hidden_layers` stacked bidirectional LSTM
    layers, a time-distributed Dense layer with 50 units and finally a CRF
    layer with 25 outputs.

    Parameters
    ----------
    units : int
        Number of units of every LSTM layer.
    optimizer : str or keras optimizer
        Optimizer handed to `Model.compile`.
    hidden_layers : int
        Number of stacked bidirectional LSTM layers. With 0 the masked input
        is fed straight into the time-distributed Dense layer.
    activation_td : str
        Activation of the time-distributed Dense layer.
    dropout, recurrent_dropout : float
        Dropout rates passed to every LSTM layer.

    Returns
    -------
    keras.models.Model
        The model, compiled with `crf_loss` and the `crf_accuracy` metric.
    """
    # Named `model_input` so that the builtin `input` is not shadowed.
    model_input = Input(shape=(80,95))
    sequence = Masking(mask_value=0.)(model_input)

    # Each layer consumes the output of the previous one, so a single running
    # tensor is enough; no per-layer bookkeeping is required and
    # hidden_layers=0 no longer ends in a KeyError.
    for _ in range(hidden_layers):
        recurrent_layer = Bidirectional(LSTM(units=units, return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout))
        sequence = recurrent_layer(sequence)

    model_last_layer = TimeDistributed(Dense(50, activation=activation_td))(sequence)
    crf = CRF(25)
    out = crf(model_last_layer)
    model_final = Model(model_input, out)
    model_final.compile(optimizer=optimizer, loss=crf_loss, metrics=[crf_accuracy])
    return model_final

#### Getting best hyperparameters of model

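Change accordingly if needed. The following combination of hyperparameters was found to work quite well, which is why I added it in. If you wish to use the best hyperparameter info from your own random search CV instead, change the cell below this one to a markdown cell and the one below that to a code cell. Note that the second cell reads from `best_hyperparams_info_filepath`, which is not defined in the Filepath section of this notebook and must be set before that cell is run.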

In [6]:
# Two-element list: only the dictionary at index 1 is read when the model is built;
# index 0 is a placeholder.
best_hyperparameter_info = [
    'dummy',
    dict(
        units_hyperparams=100,
        recurrent_dropout_hyperparams=0.3,
        optimizer_hyperparams='Adadelta',
        hidden_layers_hyperparams=1,
        epochs_hyperparams=250,
        dropout_hyperparams=0.2,
        batch_size_hyperparams=32,
    ),
]

with open(best_hyperparams_info_filepath, "rb") as t:
    best_hyperparameter_info = pickle.load(t)

#### Initializing predictive model - graph, session, model

In [7]:
#The prediction model lives in its own graph and session.
#allow_growth is switched on so that initializing the model for prediction
#does not take up all of the computer GPU's memory.
_chosen_hyperparams = best_hyperparameter_info[1]

graph_masspredictor = tf.Graph()
with graph_masspredictor.as_default():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    session_masspredictor = tf.Session(config=config)
    with session_masspredictor.as_default():
        #Rebuild the architecture with the chosen hyperparameters ...
        model_masspredictor = base_model(
            units=_chosen_hyperparams['units_hyperparams'],
            optimizer=_chosen_hyperparams['optimizer_hyperparams'],
            hidden_layers=_chosen_hyperparams['hidden_layers_hyperparams'],
            dropout=_chosen_hyperparams['dropout_hyperparams'],
            recurrent_dropout=_chosen_hyperparams['recurrent_dropout_hyperparams'],
        )
        #... then restore the trained weights and prepare the predict function
        model_masspredictor.load_weights(best_weights_filepath)
        model_masspredictor._make_predict_function()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [8]:
#Functions to remove . and , from numbers found using regex
def remove_decimal(number):
    """Return the text of regex match `number` with every '.' removed.

    Intended as the replacement callback of `re.sub`.
    """
    matched_text = number.group()
    return matched_text.replace('.', '')

def remove_comma(number):
    """Return the text of regex match `number` with every ',' removed.

    Intended as the replacement callback of `re.sub`.
    """
    matched_text = number.group()
    return matched_text.replace(',', '')

#Lookup table that turns a predicted class index back into its category.
#NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
with open(index_to_target_filepath, "rb") as index_file:
    index_to_targets = pickle.load(index_file)

#Every result frame is collected here under the key 'df_<n>' ...
results = dict()

#... where <n> is this running counter
results_counter = 0

In [9]:
#Load every tag of NLTK's upenn tagset and derive the matching 'pos_<tag>'
#column names
tagdict = load('help/tagsets/upenn_tagset.pickle')
all_pos = [*tagdict]

all_pos_tags = ['pos_' + tag_name for tag_name in all_pos]

In [10]:
#Read the pickled test queries.
#NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
with open(movie_queries_test_text_filepath, "rb") as queries_file:
    test_text = pickle.load(queries_file)

## Input texts to be extracted

In [11]:
# Only the first 20 loaded queries are processed by the prediction loop below.
test_text = test_text[:20]

test_text = #make sure this is a list of movie search queries

# Create Input data

In [12]:
#Entity extraction for every query in test_text. For each query this cell
#  1. normalises the text, splits it into sentences and POS-tags the tokens,
#  2. sends the tokens to the local word-vectorization service and reads the
#     vectors back from word_vectors_filepath,
#  3. builds one padded feature matrix per sentence and runs the tagger,
#  4. groups the words by predicted entity and stores the resulting one-row
#     DataFrame in results['df_<n>'].

#Number of time steps of the model input (see Input(shape=(80,95)) in base_model)
MAX_SENTENCE_LENGTH = 80

#All patterns below are raw strings so that \d, \s, \. and \, reach the regex
#engine as written instead of being invalid string escape sequences.
NUMBER_PATTERN = r'\d*\.?\d+'


def _build_result_frame(predicted_tag, word_sequence, tag_names):
    """Group words by predicted entity and return them as a one-row DataFrame.

    predicted_tag -- one list of tag names per sentence
    word_sequence -- one list of words per sentence, aligned with predicted_tag
    tag_names     -- every tag the model can emit; each one contributes to a
                     column even when no word received it

    Words tagged 'O' are ignored. All other tags are reduced to the part
    after the first '-', so tags that share that part end up in the same
    column. The words of a column are joined with single spaces.
    """
    words_by_tag = {tag: [] for tag in tag_names}
    for sentence_words, sentence_tags in zip(word_sequence, predicted_tag):
        for word, tag in zip(sentence_words, sentence_tags):
            if tag != 'O':
                words_by_tag[tag].append(word)

    words_by_entity = {}
    for tag, words in words_by_tag.items():
        entity = tag.split('-')[1] if tag != 'O' else 'O'
        words_by_entity.setdefault(entity, []).extend(words)

    joined = {entity: " ".join(words) for entity, words in words_by_entity.items()}
    frame = pd.DataFrame.from_dict(joined, orient='index').transpose()
    return frame.replace(to_replace=[None], value='')


#The tag names do not change between queries, so they are read once.
#NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
with open(target_to_index_filepath, "rb") as tag_file:
    target_tag_names = list(pickle.load(tag_file))

for text in test_text:

    #Initialize input_sequence
    input_sequence = []

    #Find numbers (decimal points included) before the text is normalised
    text_find_numbers = re.sub(r'[^a-zA-Z0-9.\s]+', '', text)
    numbers = re.findall(NUMBER_PATTERN, text_find_numbers)

    #Text pre-processing
    text = " ".join(text.splitlines())
    text = re.sub(r'[^a-zA-Z0-9.,\s+]+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'\d*\.?\d+', remove_decimal, text)
    text = re.sub(r'\d*\,?\d+', remove_comma, text)

    #Break text into sentences then break sentence into words and add pos tags to each word of the sentence
    pos_dict = {
        sent_num: nltk.pos_tag(word_tokenize(sentence))
        for sent_num, sentence in enumerate(sent_tokenize(text), start=1)
    }

    #Remove stray . , from words. Tokens with nothing left afterwards are dropped
    for sent_num, tagged_words in pos_dict.items():
        cleaned = []
        for word_value, tag_value in tagged_words:
            checked = re.sub(r'[^a-zA-Z0-9]+', '', word_value)
            if checked:
                cleaned.append((checked, tag_value))
        pos_dict[sent_num] = cleaned

    #Create dataframe with corresponding sentence number, word, and part of speech columns
    token_rows = [
        (sent_num, *tagged_word)
        for sent_num, tagged_words in pos_dict.items()
        for tagged_word in tagged_words
    ]
    df = pd.DataFrame(token_rows, columns=['sentence_no','word','pos'])

    #A query without any usable token cannot be vectorised or tagged. Store an
    #all-empty result so that results stays aligned with test_text.
    if df.empty:
        results['df_{}'.format(results_counter)] = _build_result_frame([], [], target_tag_names)
        results_counter += 1
        continue

    #Use later on
    df_for_prediction = df.copy()

    #Get list of words from dataframe
    tokenized_text = df['word'].tolist()

    #The response body is not used; the vectors are read from
    #word_vectors_filepath right after the request (presumably written there by
    #the service - confirm). A failed request therefore must not go unnoticed,
    #otherwise vectors of an earlier query could be picked up.
    with requests.Session() as http_session:
        http_session.trust_env = False
        response = http_session.post('http://127.0.0.1:5000/word_vectorization', json = tokenized_text) #add proxies args if needed
        response.raise_for_status()

    #NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
    with open(word_vectors_filepath, "rb") as vector_file:
        word_vectors = pickle.load(vector_file)

    #Add word features to dataframe
    df['word_vec'] = word_vectors
    df = pd.get_dummies(df, columns=['pos'])

    #Add all pos columns and rearrange in fixed order for consistency
    existing_cols = list(df.columns)
    for missing_pos in [tag for tag in all_pos_tags if tag not in existing_cols]:
        df[missing_pos] = 0

    arrange_df_cols = ['sentence_no','word','word_vec'] + list(all_pos_tags)
    df = df.reindex(columns=arrange_df_cols)

    #Get the sentence feature vectors. Each sentence contains a list of all its word feature vectors.
    df = df.drop(columns=['word'])
    sentence_feature_vectors = {}
    for _, row in df.iterrows():
        #Position 0 is sentence_no, everything after it is flattened into one vector
        word_feature_vector = np.concatenate((row.iloc[1:]), axis = None)
        sentence_feature_vectors.setdefault(row.iloc[0], []).append(word_feature_vector)

    #Pad length for sentences and append to the input_sequence.
    #The vector length is taken from the first sentence that has tokens, which
    #is not necessarily sentence number 1.
    #NOTE(review): sentences with more than MAX_SENTENCE_LENGTH tokens are not
    #truncated and will not match the model's input shape.
    dummy_length = len(next(iter(sentence_feature_vectors.values()))[0])
    zero_vector = np.zeros(dummy_length, dtype=int)
    for sentence in sentence_feature_vectors.values():
        sentence.extend([zero_vector] * (MAX_SENTENCE_LENGTH - len(sentence)))
        input_sequence.append(np.array(sentence))

    x = np.array(input_sequence)

    #Predict y values using x values and convert integer y to its correct entity
    with session_masspredictor.as_default():
        prediction = np.argmax(model_masspredictor.predict(x), axis=-1)
    predicted_tag = [[index_to_targets[i] for i in row] for row in prediction]

    #Generate a padded word sequence for each sentence
    words_by_sentence = {}
    for _, row in df_for_prediction.iterrows():
        words_by_sentence.setdefault(row['sentence_no'], []).append(row['word'])

    word_sequence = []
    for sentence in words_by_sentence.values():
        sentence.extend(['padding'] * (MAX_SENTENCE_LENGTH - len(sentence)))
        word_sequence.append(sentence)

    #Add decimal points back to numbers that have them. numbers is consumed in
    #order: a word whose digits equal the next number uses that number up, but
    #only purely numeric words are replaced by the original spelling.
    number_index = 0
    for sentence in word_sequence:
        for word_index, word in enumerate(sentence):
            if number_index < len(numbers):
                if re.findall(NUMBER_PATTERN, word) == [numbers[number_index].replace('.','')]:
                    if not re.search('[a-zA-Z+]', word):
                        sentence[word_index] = numbers[number_index]
                    number_index += 1

    #Group the words by predicted entity and store the result of this query
    results['df_{}'.format(results_counter)] = _build_result_frame(predicted_tag, word_sequence, target_tag_names)
    results_counter+=1

In [13]:
#Write every collected result frame to its own CSV file.
#The loop runs over what is actually stored in results instead of a fixed
#count of 20, so it neither fails when fewer queries were processed nor skips
#results beyond the 20th.
os.makedirs(os.path.dirname(mass_predictor_results_filepath), exist_ok=True)
for result_name, result_frame in results.items():
    #Keys have the form 'df_<n>'; <n> becomes the number in the file name
    result_number = result_name.split('_', 1)[1]
    result_frame.to_csv(mass_predictor_results_filepath.format(result_number))

## Some Test Text Results

### Result 1

In [14]:
test_text[0]

'are there any good romantic comedies out right now'

In [15]:
results['df_0']

Unnamed: 0,O,ACTOR,CHARACTER,DIRECTOR,GENRE,PLOT,RATING,RATINGS_AVERAGE,REVIEW,SONG,TITLE,TRAILER,YEAR
0,,,,,romantic comedies,,,,,,,,


### Result 2

In [16]:
test_text[1]

'show me a movie about cars that talk'

In [17]:
results['df_1']

Unnamed: 0,O,ACTOR,CHARACTER,DIRECTOR,GENRE,PLOT,RATING,RATINGS_AVERAGE,REVIEW,SONG,TITLE,TRAILER,YEAR
0,,,,,,cars,,,,,,,


### Result 3

In [18]:
test_text[2]

'list the five star rated movies starring mel gibson'

In [19]:
results['df_2']

Unnamed: 0,O,ACTOR,CHARACTER,DIRECTOR,GENRE,PLOT,RATING,RATINGS_AVERAGE,REVIEW,SONG,TITLE,TRAILER,YEAR
0,,mel gibson,,,,,,five,,,,,
