In [1]:
import os
curr_dir = os.getcwd()

## Filepath

In [2]:
def _project_path(*parts):
    """Return the path to ``parts`` under the working directory, with backslashes replaced by forward slashes."""
    return os.path.join(curr_dir, *parts).replace('\\','/')

#Every file this notebook reads sits in a sub-folder of the working directory
word_vectors_filepath = _project_path('word_vector','word_vector.txt')
movie_queries_test_text_filepath = _project_path('test_set','movie_queries_test_text.txt')
index_to_target_filepath = _project_path('index_converter','index_to_target.txt')
target_to_index_filepath = _project_path('index_converter','target_to_index.txt')
best_weights_filepath = _project_path('model_training_weights','weights.best.hdf5')
best_hyperparams_info_filepath = _project_path('random_search_data','best_hyperparameter_info.txt')

## Imports

In [3]:
import requests
import pickle
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.data import load
import numpy as np
import pandas as pd
import string
import re 
from keras import backend as k
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, LSTM, Input, concatenate, TimeDistributed, Bidirectional, Masking
from keras_contrib.layers import CRF
from keras_contrib.metrics import crf_viterbi_accuracy, crf_accuracy
from keras_contrib.losses import crf_loss
from keras.optimizers import Adam  
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from keras.preprocessing.text import text_to_word_sequence
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split, ParameterGrid, ParameterSampler, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from keras.wrappers.scikit_learn import KerasClassifier
import tensorflow as tf

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### You only need to run the cell below once; afterwards you can delete it from this notebook and from all the other notebooks

In [4]:
#Fetch the NLTK resources used by this notebook (the log below shows they are skipped when already up to date)
#tagsets supplies help/tagsets/upenn_tagset.pickle, which is loaded further down to list every POS tag
nltk.download('tagsets')
#presumably the model behind sent_tokenize / word_tokenize - confirm against the NLTK docs
nltk.download('punkt')
#presumably the model behind nltk.pos_tag - confirm against the NLTK docs
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Getting Test Text

In [5]:
#Unpickle the saved test queries (pickle can execute code while loading, so only open files you produced yourself)
with open(movie_queries_test_text_filepath, "rb") as test_text_file:
    test_text = pickle.load(test_text_file)

## Configurations

* Add the functions needed for cleaning the raw text
* Set a dictionary for the index to be converted back into the target name once prediction has been done
* Initialize the input sequence

In [6]:
#Replacement callback for re.sub: strips every . from the text a regex matched
def remove_decimal(number):
    """Return the full text of the match object ``number`` with all '.' characters removed."""
    matched_text = number.group(0)
    return ''.join(matched_text.split('.'))

#Replacement callback for re.sub: strips every , from the text a regex matched
def remove_comma(number):
    """Return the full text of the match object ``number`` with all ',' characters removed."""
    matched_text = number.group(0)
    return ''.join(matched_text.split(','))

#Define function to create base model dynamically
#The hidden layers are chained in a loop so their number can be chosen at call time
def base_model(units=50, optimizer='Adam', hidden_layers=2, activation_td ='relu', dropout=0.1, recurrent_dropout=0.1):
    """Build and compile the Masking -> stacked BiLSTM -> TimeDistributed(Dense) -> CRF model.

    Parameters
    ----------
    units : int
        Number of units in each LSTM (every LSTM is wrapped in ``Bidirectional``).
    optimizer : str or keras optimizer
        Passed straight to ``Model.compile``.
    hidden_layers : int
        How many bidirectional LSTM layers to stack. With 0 the masked input is fed
        directly to the time-distributed dense layer (this used to raise ``KeyError``).
    activation_td : str
        Activation of the time-distributed dense layer.
    dropout, recurrent_dropout : float
        Dropout rates handed to every LSTM layer.

    Returns
    -------
    keras.models.Model
        Model compiled with ``crf_loss`` and the ``crf_accuracy`` metric.
    """
    #Fixed input shape (80, 95); elsewhere in this notebook sentences are padded to 80 rows of 95 features
    #Named model_input so the builtin input() is not shadowed
    model_input = Input(shape=(80,95))
    #mask_value=0. - presumably so that the all-zero padding rows are skipped by the layers that follow
    layer_output = Masking(mask_value=0.)(model_input)
    #Each pass stacks one more bidirectional LSTM on top of the previous layer's output
    for _ in range(hidden_layers):
        layer_output = Bidirectional(LSTM(units=units, return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout))(layer_output)
    layer_output = TimeDistributed(Dense(50, activation=activation_td))(layer_output)
    #CRF output layer with 25 units - presumably one per target tag (the tag dictionaries in this notebook have 25 entries)
    out = CRF(25)(layer_output)
    model_final = Model(model_input, out)
    model_final.compile(optimizer=optimizer, loss=crf_loss, metrics=[crf_accuracy])
    return model_final

#### Getting best hyperparameters of model

In [7]:
#Unpickle the stored best-hyperparameter record (only unpickle files you produced yourself)
with open(best_hyperparams_info_filepath, "rb") as hyperparams_file:
    best_hyperparameter_info = pickle.load(hyperparams_file)

#### Initializing predictive model - graph, session, model

In [8]:
#The hyperparameter dictionary is stored at position 1 of the loaded record
best_hyperparams = best_hyperparameter_info[1]

#The model gets its own graph and session
#allow_growth stops TensorFlow from claiming all of the GPU memory when the model is initialized for prediction
graph_predictor = tf.Graph()
with graph_predictor.as_default():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    session_predictor = tf.Session(config=config)
    with session_predictor.as_default():
        #Rebuild the architecture with the stored hyperparameters, then load the saved weights into it
        model_predictor = base_model(
            units=best_hyperparams['units_hyperparams'],
            optimizer=best_hyperparams['optimizer_hyperparams'],
            hidden_layers=best_hyperparams['hidden_layers_hyperparams'],
            dropout=best_hyperparams['dropout_hyperparams'],
            recurrent_dropout=best_hyperparams['recurrent_dropout_hyperparams'],
        )
        model_predictor.load_weights(best_weights_filepath)
        model_predictor._make_predict_function()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [9]:
#Index -> target label dictionary, used to turn the predicted indices back into readable labels
#(only unpickle files you produced yourself)
with open(index_to_target_filepath, "rb") as index_to_target_file:
    index_to_targets = pickle.load(index_to_target_file)

In [10]:
index_to_targets

{0: 'O',
 1: 'B-ACTOR',
 2: 'B-CHARACTER',
 3: 'B-DIRECTOR',
 4: 'B-GENRE',
 5: 'B-PLOT',
 6: 'B-RATING',
 7: 'B-RATINGS_AVERAGE',
 8: 'B-REVIEW',
 9: 'B-SONG',
 10: 'B-TITLE',
 11: 'B-TRAILER',
 12: 'B-YEAR',
 13: 'I-ACTOR',
 14: 'I-CHARACTER',
 15: 'I-DIRECTOR',
 16: 'I-GENRE',
 17: 'I-PLOT',
 18: 'I-RATING',
 19: 'I-RATINGS_AVERAGE',
 20: 'I-REVIEW',
 21: 'I-SONG',
 22: 'I-TITLE',
 23: 'I-TRAILER',
 24: 'I-YEAR'}

In [11]:
#Initialize input_sequence
input_sequence = []

In [12]:
#Load all possible pos tags
tagdict = load('help/tagsets/upenn_tagset.pickle')
all_pos = list(tagdict.keys())

#Prefix every tag with pos_ so the names line up with the columns pd.get_dummies creates from the pos column
all_pos_tags = ['pos_'+pos for pos in all_pos]

In [13]:
all_pos

['LS',
 'TO',
 'VBN',
 "''",
 'WP',
 'UH',
 'VBG',
 'JJ',
 'VBZ',
 '--',
 'VBP',
 'NN',
 'DT',
 'PRP',
 ':',
 'WP$',
 'NNPS',
 'PRP$',
 'WDT',
 '(',
 ')',
 '.',
 ',',
 '``',
 '$',
 'RB',
 'RBR',
 'RBS',
 'VBD',
 'IN',
 'FW',
 'RP',
 'JJR',
 'JJS',
 'PDT',
 'MD',
 'VB',
 'WRB',
 'NNP',
 'EX',
 'NNS',
 'SYM',
 'CC',
 'CD',
 'POS']

## Input text to be extracted

Here, I am using the test text provided by the MIT website, which I have scraped. If you wish to test with your own text, change the code cell below into a markdown cell and change the cell that says "Input a movie search query" into a code cell.

In [14]:
text = test_text[5]

text = "Input a movie search query"

## Find Numbers

Decimal points from numbers will be removed during text cleaning as these decimal points could be mistaken as fullstops by the sentence tokenizer which would result in inaccurate sentence tokenization.

The numbers list acts as a reference to replace the unformatted decimal numbers with their original format.

In [15]:
text

'show me 1980s action movies'

In [16]:
#Find numbers
#Everything except letters, digits, . and whitespace is stripped first,
#so 1,000 is recorded as 1000 while 7.5 keeps its decimal point
#Raw strings stop the regex backslashes from being read as (invalid) Python string escapes
text_find_numbers = re.sub(r'[^a-zA-Z0-9.\s]+','',text)
numbers = re.findall(r'\d*\.?\d+',text_find_numbers)

In [17]:
numbers

['1980']

## Text Cleaning for raw text

Characters of words that are not letters, numbers or punctuation are removed as this will impact sentence tokenization,
word tokenization and word vectorization

In [18]:
#Text cleaning
#Characters that are not letters, digits, . , or whitespace are removed as they would otherwise affect
#sentence tokenization, word tokenization and word vectorization
#Raw strings stop the regex backslashes from being read as (invalid) Python string escapes
#Put the whole text on a single line
text = " ".join(text.splitlines())
text = re.sub(r'[^a-zA-Z0-9.,\s]+','',text)
#Collapse runs of whitespace into single spaces
text = re.sub(r'\s+', ' ', text).strip()
#Drop the . inside numbers (7.5 -> 75) so the sentence tokenizer does not read it as a full stop
text = re.sub(r'\d*\.?\d+',remove_decimal,text)
#Drop the , inside numbers (1,000 -> 1000)
text = re.sub(r'\d*\,?\d+',remove_comma,text)

In [19]:
text

'show me 1980s action movies'

In [20]:
#Split the text into sentences, split every sentence into words and tag each word with its part of speech
#Keys are 1-based sentence numbers, values are lists of (word, pos tag) tuples
sentences = sent_tokenize(text)
pos_dict = {
    sent_num: nltk.pos_tag(word_tokenize(sentence))
    for sent_num, sentence in enumerate(sentences, start=1)
}

In [21]:
pos_dict

{1: [('show', 'VB'),
  ('me', 'PRP'),
  ('1980s', 'CD'),
  ('action', 'NN'),
  ('movies', 'NNS')]}

In [22]:
#Strip stray . , from every tagged word; a token that consists of nothing else is dropped together with its tag
for sentence_key, tagged_words in pos_dict.items():
    kept_words = []
    for original_word, pos_tag_value in tagged_words:
        stripped_word = re.sub('[^a-zA-Z0-9]+','',original_word)
        #Report every word that lost characters
        if len(stripped_word) != len(original_word):
            print('Found stray punctuation')
            print('Original: {}'.format(original_word))
            print('New: {}'.format(stripped_word))
            print('')
            print('#################')
        #Keep the word only when something is left of it
        if len(stripped_word) != 0:
            kept_words.append((stripped_word, pos_tag_value))

    pos_dict[sentence_key] = kept_words

In [23]:
pos_dict

{1: [('show', 'VB'),
  ('me', 'PRP'),
  ('1980s', 'CD'),
  ('action', 'NN'),
  ('movies', 'NNS')]}

In [24]:
#One row per word: (sentence number, word, part of speech)
L = []
for sentence_no, tagged_words in pos_dict.items():
    for word_and_tag in tagged_words:
        L.append((sentence_no, *word_and_tag))
df = pd.DataFrame(L, columns=['sentence_no','word','pos'])

#Untouched copy, used later to map the predictions back to their words
df_for_prediction = df.copy()

In [25]:
df

Unnamed: 0,sentence_no,word,pos
0,1,show,VB
1,1,me,PRP
2,1,1980s,CD
3,1,action,NN
4,1,movies,NNS


In [26]:
#Get list of words from dataframe
tokenized_text = df['word'].tolist()

In [27]:
tokenized_text

['show', 'me', '1980s', 'action', 'movies']

In [28]:
#Call Word Vectorization API
word_vector_api_data = tokenized_text
session = requests.Session()
#Do not pick up proxy settings from the environment for this localhost call
session.trust_env = False
word_vector_response = session.post('http://127.0.0.1:5000/word_vectorization', json = word_vector_api_data) #add proxies args if needed
#Stop here on a 4xx/5xx reply. Presumably the API writes the vector file read in the next cell,
#so carrying on after a failed call would load vectors left over from an earlier request - confirm against the API
word_vector_response.raise_for_status()
word_vector_response

<Response [200]>

In [29]:
#Load the processed word vectors (only unpickle files you produced yourself)
with open(word_vectors_filepath, "rb") as word_vectors_file:
    word_vectors = pickle.load(word_vectors_file)

In [30]:
#Add word features to dataframe
#Attach each word's vector, then one-hot encode the pos column into pos_<TAG> indicator columns
df = pd.get_dummies(df.assign(word_vec=word_vectors), columns=['pos'])

In [31]:
#Pos columns that this text did not produce
df_cols = list(df.columns)
add_pos_col = [tag_col for tag_col in all_pos_tags if tag_col not in df_cols]

#Fixed column order so every text yields the same feature layout as the training data
arrange_df_cols = ['sentence_no','word','word_vec'] + all_pos_tags

#Add the missing pos columns filled with 0, then put all columns into the fixed order
df = df.assign(**{tag_col: 0 for tag_col in add_pos_col}).reindex(columns=arrange_df_cols)

In [32]:
df

Unnamed: 0,sentence_no,word,word_vec,pos_LS,pos_TO,pos_VBN,pos_'',pos_WP,pos_UH,pos_VBG,...,pos_MD,pos_VB,pos_WRB,pos_NNP,pos_EX,pos_NNS,pos_SYM,pos_CC,pos_CD,pos_POS
0,1,show,"[0.05406701, -0.15761112, -0.7017879, 0.234091...",0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1,me,"[0.44699496, -0.6717845, -1.0470167, 0.9299539...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1980s,"[0.5675778, 0.09792379, -0.026658123, 0.374034...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,action,"[0.31881523, -0.29548982, -0.08960247, 0.87853...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,movies,"[0.2133574, -0.056065083, -1.0704498, 0.231683...",0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [33]:
#Get the sentence feature vectors. Each sentence feature vector contains a list of all its word feature vectors.
#errors='ignore' lets this cell be re-run after the word column has already been dropped
df = df.drop(columns=['word'], errors='ignore')
sentence_feature_vectors = {}
for index,row in df.iterrows():
    #.iloc makes the positional lookups explicit; plain row[0] on a label-indexed Series is deprecated in recent pandas
    sentence_number = row.iloc[0]
    #Flatten the word vector and the pos indicator columns into one 1-D feature vector
    word_feature_vector = np.concatenate((row.iloc[1:]), axis = None)
    sentence_feature_vectors.setdefault(sentence_number, []).append(word_feature_vector)

In [34]:
sentence_feature_vectors

{1: [array([ 0.05406701, -0.15761112, -0.7017879 ,  0.23409137,  0.49511296,
          0.5845138 , -0.21149723,  0.41470313, -0.66746515,  0.3139659 ,
         -0.22264327, -0.06002229, -0.26404837,  0.11272451, -0.07870636,
         -0.00662771, -0.09953663,  0.19331707, -0.65225816, -0.23743977,
          0.22146857,  0.44456705, -0.10762705, -0.02459927,  0.39042968,
          0.2107949 ,  0.09701276, -0.1647754 , -0.3796033 ,  0.01939271,
         -0.23600912,  0.01130283,  0.2009593 ,  0.05590365,  0.0929635 ,
         -0.14136171,  0.04371291, -0.08660819,  0.02132413, -0.14475128,
          0.7776699 , -0.3512436 , -0.50120497, -0.04874106,  0.31615117,
         -0.07757556,  0.19927543, -0.15894759, -0.02038171,  0.16272163,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        , 

In [35]:
#Pad every sentence to 80 word vectors and collect them in input_sequence
#input_sequence is rebuilt from scratch so re-running this cell does not append the same sentences a second time
input_sequence = []
dummy_length = len(sentence_feature_vectors[1][0])
for sentence in sentence_feature_vectors.values():
    #The model input is fixed at 80 time steps; a longer sentence would make the array ragged, so cut it and say so
    if len(sentence) > 80:
        print('Sentence has {} words, only the first 80 are passed to the model'.format(len(sentence)))
    #Work on a copy so sentence_feature_vectors itself is left unchanged
    padded_sentence = list(sentence[:80])
    #All-zero rows fill the sentence up to 80
    padded_sentence.extend([np.zeros(dummy_length)] * (80 - len(padded_sentence)))
    input_sequence.append(np.array(padded_sentence))

x = np.array(input_sequence)

In [36]:
x

array([[[ 0.05406701, -0.15761112, -0.70178789, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.44699496, -0.67178452, -1.04701674, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.56757778,  0.09792379, -0.02665812, ...,  0.        ,
          1.        ,  0.        ],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]]])

In [37]:
x.shape

(1, 80, 95)

## Prediction for all entities 

`np.argmax` is used to get the index of the tag with the highest probability in the arrays

In [38]:
#WITHOUT np.argmax: keep the raw model output to show what it looks like
with session_predictor.as_default():
    result_dummy = model_predictor.predict(x)

shape_message = 'Without np argmax the output sequence is a 3D output where each word contains one hot encodings: {}'
print(shape_message.format(result_dummy.shape))
result_dummy

Without np argmax the output sequence is a 3D output where each word contains one hot encodings: (1, 80, 25)


array([[[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]]], dtype=float32)

In [39]:
#Predict with the model, then reduce the last axis to the index of its highest value for every word
with session_predictor.as_default():
    tag_scores = model_predictor.predict(x)
prediction = np.argmax(tag_scores, axis=-1)

print('With np argmax the output sequence is a 2D output where each word is an index: {}'.format(prediction.shape))
prediction

With np argmax the output sequence is a 2D output where each word is an index: (1, 80)


array([[ 0,  0, 12,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=int64)

#### Convert the index to its respective target

In [40]:
#Convert the index of the prediction to its respective target tag
predicted_tag = [[index_to_targets[i] for i in row] for row in prediction]

In [41]:
predicted_tag

[['O',
  'O',
  'B-YEAR',
  'B-GENRE',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O']]

<br>

#### Now that we have the predicted tags we need to get their respective words by getting the word in the same position as the target in its array

#### Create a list of lists of the words that has the same shape as predicted_tag list of lists

#### Create a dictionary where each key is a sentence number and each value is the words of that sentence
<br>
<br>

In [42]:
#Create a dictionary with sentence number as keys and the values as the words of the sentence
#Columns are selected by name instead of by position (row[0] / row[1] on a label-indexed Series is deprecated in recent pandas)
sentences = {}

for sentence_number, word in df_for_prediction[['sentence_no','word']].itertuples(index=False, name=None):
    sentences.setdefault(sentence_number, []).append(word)

In [43]:
sentences

{1: ['show', 'me', '1980s', 'action', 'movies']}

<br>

#### Add padding to each word list as the prediction contains padding too
<br>

In [44]:
#Fill every word list up to 80 entries with the literal 'padding' so it has the same length as the padded prediction
word_sequence = []
for sentence in sentences.values():
    #A negative repeat count gives an empty list, so longer sentences are left as they are
    sentence.extend(['padding'] * (80 - len(sentence)))
    word_sequence.append(sentence)

In [45]:
word_sequence

[['show',
  'me',
  '1980s',
  'action',
  'movies',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding'

<br>

#### Adding formatting back to numbers that are supposed to have decimal points

However, some words may still be attached to numbers such as '4th' as recognized by the word tokenizer and so we should not separate it. The same goes for numbers such as '200+'. 
<br>
<br>

```python
    re.findall('\d*\.?\d+',word)
``` 
searches for a number in the current word 

```python
    re.search('[a-zA-Z+]', word)
```
searches for text and plus sign in the word. 

If the word contains either a letter or a plus sign, we do not replace anything
<br>
<br>

In [46]:
#Outer loop loops over sentences
#Inner loop loops over words in the sentence

#Add decimal points back to numbers that have them

#NOTE(review): counter only advances when numbers[counter] matches a word, so a number that never
#matches any word stops every later number from being restored

#Counter for moving through numbers list
counter = 0
for sentence in word_sequence:
    #curr_index tracks the position of the word in the sentence
    for curr_index, word in enumerate(sentence):

        #Nothing left to do once all the formatted numbers have been used
        if counter >= len(numbers):
            break

        #Skip the word unless it holds exactly the next number with its decimal point removed
        #(raw string so the regex backslashes are not read as invalid Python string escapes)
        if re.findall(r'\d*\.?\d+',word) != [numbers[counter].replace('.','')]:
            continue

        #A letter or a plus sign in the word (4th, 200+) means it is left as it is;
        #otherwise the word is swapped for the number in its original format
        if not re.search('[a-zA-Z+]', word):
            sentence[curr_index] = numbers[counter]

        #Either way, move on to the next number
        counter += 1

In [47]:
word_sequence

[['show',
  'me',
  '1980s',
  'action',
  'movies',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding',
  'padding'

#### Initialize an empty dictionary to store the results

In [48]:
#Target label -> index dictionary; its keys are reused below as the keys of the result dictionary
#(only unpickle files you produced yourself)
with open(target_to_index_filepath, "rb") as target_to_index_file:
    old_result_dict = pickle.load(target_to_index_file)

In [49]:
old_result_dict

{'O': 0,
 'B-ACTOR': 1,
 'B-CHARACTER': 2,
 'B-DIRECTOR': 3,
 'B-GENRE': 4,
 'B-PLOT': 5,
 'B-RATING': 6,
 'B-RATINGS_AVERAGE': 7,
 'B-REVIEW': 8,
 'B-SONG': 9,
 'B-TITLE': 10,
 'B-TRAILER': 11,
 'B-YEAR': 12,
 'I-ACTOR': 13,
 'I-CHARACTER': 14,
 'I-DIRECTOR': 15,
 'I-GENRE': 16,
 'I-PLOT': 17,
 'I-RATING': 18,
 'I-RATINGS_AVERAGE': 19,
 'I-REVIEW': 20,
 'I-SONG': 21,
 'I-TITLE': 22,
 'I-TRAILER': 23,
 'I-YEAR': 24}

In [50]:
#Give every tag an empty list that will collect the words predicted for it
#A comprehension keeps its loop variable local, so the keras backend alias k from the imports cell is not overwritten
old_result_dict = {tag: [] for tag in old_result_dict}

In [51]:
#Initialize result dictionary
old_result_dict

{'O': [],
 'B-ACTOR': [],
 'B-CHARACTER': [],
 'B-DIRECTOR': [],
 'B-GENRE': [],
 'B-PLOT': [],
 'B-RATING': [],
 'B-RATINGS_AVERAGE': [],
 'B-REVIEW': [],
 'B-SONG': [],
 'B-TITLE': [],
 'B-TRAILER': [],
 'B-YEAR': [],
 'I-ACTOR': [],
 'I-CHARACTER': [],
 'I-DIRECTOR': [],
 'I-GENRE': [],
 'I-PLOT': [],
 'I-RATING': [],
 'I-RATINGS_AVERAGE': [],
 'I-REVIEW': [],
 'I-SONG': [],
 'I-TITLE': [],
 'I-TRAILER': [],
 'I-YEAR': []}

### Getting the respective words of the targets and storing the results

Outer loop loops through sentences, inner loop loops through the sentence

Getting the corresponding word of the target now that both arrays have the exact same shape

In [52]:
#Walk through the predictions sentence by sentence and word by word;
#sent_counter and word_counter are the 0-based positions used to look up the matching word

for sent_counter, sentence_prediction in enumerate(predicted_tag):
    for word_counter, single_prediction in enumerate(sentence_prediction):
        #Words tagged O are not part of any entity
        if single_prediction == 'O':
            continue

        print('Predicted tag found: {}'.format(single_prediction))
        print('Its position in the word sequence is')
        print('    Sentence Number: {} , Index: {}'.format(sent_counter,word_counter))

        #The word at the same position as the predicted label is stored under that label
        matched_word = word_sequence[sent_counter][word_counter]
        old_result_dict[single_prediction].append(matched_word)

        print('Added word: {} from word sequence using subsets {}, {}'.format(matched_word,sent_counter,word_counter))
        print(' ')

Predicted tag found: B-YEAR
Its position in the word sequence is
    Sentence Number: 0 , Index: 2
Added word: 1980s from word sequence using subsets 0, 2
 
Predicted tag found: B-GENRE
Its position in the word sequence is
    Sentence Number: 0 , Index: 3
Added word: action from word sequence using subsets 0, 3
 


In [53]:
old_result_dict

{'O': [],
 'B-ACTOR': [],
 'B-CHARACTER': [],
 'B-DIRECTOR': [],
 'B-GENRE': ['action'],
 'B-PLOT': [],
 'B-RATING': [],
 'B-RATINGS_AVERAGE': [],
 'B-REVIEW': [],
 'B-SONG': [],
 'B-TITLE': [],
 'B-TRAILER': [],
 'B-YEAR': ['1980s'],
 'I-ACTOR': [],
 'I-CHARACTER': [],
 'I-DIRECTOR': [],
 'I-GENRE': [],
 'I-PLOT': [],
 'I-RATING': [],
 'I-RATINGS_AVERAGE': [],
 'I-REVIEW': [],
 'I-SONG': [],
 'I-TITLE': [],
 'I-TRAILER': [],
 'I-YEAR': []}

As each label has B-, I- tags, we need to consolidate the variations of the labels to its root label. i.e B-ACTOR and I-ACTOR should be considered as ACTOR.

We iterate through the dictionary and extract out the root label, and add the value

In [54]:
#Merge the B- and I- variants of every label under their root label (B-ACTOR and I-ACTOR -> ACTOR)
#NOTE(review): words are appended in dictionary order, so when all B- keys come before the I- keys
#(as in the dictionary displayed above) the B- words of a label precede its I- words and
#multi-word entities are not kept in sentence order
result_dict = {}
for tag, words in old_result_dict.items():
    #O has no prefix; every other tag is split once at the first - so only the prefix is removed
    root_label = tag.split('-', 1)[1] if tag != 'O' else 'O'
    #setdefault starts from a fresh list, so the lists inside old_result_dict are never modified
    #and re-running this cell does not duplicate words
    result_dict.setdefault(root_label, []).extend(words)

In [55]:
result_dict

{'O': [],
 'ACTOR': [],
 'CHARACTER': [],
 'DIRECTOR': [],
 'GENRE': ['action'],
 'PLOT': [],
 'RATING': [],
 'RATINGS_AVERAGE': [],
 'REVIEW': [],
 'SONG': [],
 'TITLE': [],
 'TRAILER': [],
 'YEAR': ['1980s']}

We now join all the values in the arrays to make it cleaner

In [56]:
#Join the words of every label into one space-separated string
#A comprehension keeps its loop variables local, so no notebook-level name such as k is overwritten
result_dict_clean = {label: " ".join(words) for label, words in result_dict.items()}

In [57]:
result_dict_clean

{'O': '',
 'ACTOR': '',
 'CHARACTER': '',
 'DIRECTOR': '',
 'GENRE': 'action',
 'PLOT': '',
 'RATING': '',
 'RATINGS_AVERAGE': '',
 'REVIEW': '',
 'SONG': '',
 'TITLE': '',
 'TRAILER': '',
 'YEAR': '1980s'}

In [58]:
#Labels become the index first and are then transposed into the columns of a single-row frame
result_df = pd.DataFrame.from_dict(result_dict_clean, orient='index').T

In [59]:
result_df = result_df.replace(to_replace=[None], value='')

## Final Result

In [60]:
result_df

Unnamed: 0,O,ACTOR,CHARACTER,DIRECTOR,GENRE,PLOT,RATING,RATINGS_AVERAGE,REVIEW,SONG,TITLE,TRAILER,YEAR
0,,,,,action,,,,,,,,1980s
