In [1]:
import os
# Base directory for every path built in this notebook: the current working
# directory of the running process.
curr_dir = os.getcwd()

## Filepath

In [2]:
def _project_path(*parts):
    """Join ``parts`` onto ``curr_dir`` and return the result using forward slashes only."""
    joined = os.path.join(curr_dir, *parts)
    return joined.replace('\\', '/')


# Raw query file and the CSV training table derived from it.
movie_queries_filepath = _project_path('training_set', 'movie_queries.txt')
movie_queries_training_dataset_filepath = _project_path('training_set', 'movie_queries_training_dataset.csv')

# Lookup tables between target labels and their integer indexes.
index_to_target_filepath = _project_path('index_converter', 'index_to_target.txt')
target_to_index_filepath = _project_path('index_converter', 'target_to_index.txt')

## Scraping the training set online and saving it

In [3]:
import urllib.request
url = 'https://groups.csail.mit.edu/sls/downloads/movie/engtrain.bio'
# urlretrieve writes straight to the destination path and raises FileNotFoundError
# when the parent folder is missing (e.g. on a fresh checkout), so create it first.
os.makedirs(os.path.dirname(movie_queries_filepath), exist_ok=True)
urllib.request.urlretrieve(url, movie_queries_filepath)
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
import pandas as pd
import string
import re 
import pickle

In [4]:
# Read the downloaded file into a list with one element per line
# (each element keeps its trailing newline).
with open(movie_queries_filepath) as queries_file:
    content = list(queries_file)

## Example of lines in txt file

In [5]:
content[:10]

['O\twhat\n',
 'O\tmovies\n',
 'O\tstar\n',
 'B-ACTOR\tbruce\n',
 'I-ACTOR\twillis\n',
 '\n',
 'O\tshow\n',
 'O\tme\n',
 'O\tfilms\n',
 'O\twith\n']

## Example of splitting text into labels and words

In [6]:
content[0].strip('\n').split('\t')

['O', 'what']

In [7]:
content[5].strip('\n').split('\t')

['']

##  Splitting text into words and labels

Characters that are not letters, digits, periods or commas are removed from each word, because they would otherwise interfere with:
* Sentence Tokenization
* Word Tokenization
* Word Vectorization

In [8]:
def parse_bio_lines(lines):
    """Group raw ``LABEL\\tword\\n`` lines into sentences and their label lists.

    Parameters
    ----------
    lines : iterable of str
        Raw lines of the BIO file. A line that does not split into exactly two
        tab-separated fields (in practice the blank line ``'\\n'``) marks the
        end of the current sentence.

    Returns
    -------
    (sentences, targets) : tuple of two lists
        ``sentences[i]`` is the space-joined cleaned words of sentence ``i`` and
        ``targets[i]`` is the list of labels aligned with those words.
    """
    # Everything except letters, digits, '.' and ',' is stripped from each word.
    disallowed_chars = re.compile('[^a-zA-Z0-9.,]+')

    sentences = []
    targets = []

    temp_sentence = []
    temp_targets = []

    # `line_index` is only used to report where an anomaly was found.
    for line_index, raw_line in enumerate(lines):
        # Split the raw line into its target label and word. Ex. O\twhat\n -> ['O','what']
        extracted = raw_line.strip('\n').split('\t')

        # Two fields means we are still inside a sentence.
        if len(extracted) == 2:
            clean = disallowed_chars.sub('', extracted[1])
            # A word with nothing left after cleaning is dropped together with its label,
            # so words and labels stay aligned.
            if len(clean) == 0:
                print('Anomaly found {} at index {}'.format(extracted[1], line_index))
                continue
            temp_targets.append(extracted[0])
            temp_sentence.append(clean)

        # Anything else (Ex. \n -> ['']) closes the current sentence.
        else:
            targets.append(temp_targets)
            sentences.append(' '.join(temp_sentence))
            # Start collecting the next sentence.
            temp_sentence = []
            temp_targets = []

    # If the input does not end with a blank line, the last sentence is still
    # buffered here; flush it so it is not silently lost.
    if temp_sentence:
        targets.append(temp_targets)
        sentences.append(' '.join(temp_sentence))

    return sentences, targets


sentences, targets = parse_bio_lines(content)

In [9]:
sentences[0]

'what movies star bruce willis'

In [10]:
targets[0]

['O', 'O', 'O', 'B-ACTOR', 'I-ACTOR']

## Perform part of speech tagging

Part-of-speech tags are added to each word to help the model learn the grammar of movie queries.

In [11]:
# Tokenize each sentence and tag every token with its part of speech.
# The dictionary key is the sentence's position in `sentences`.
pos_dict = {}
for sent_num, sentence in enumerate(sentences):
    tokens = word_tokenize(sentence)
    pos_dict[sent_num] = nltk.pos_tag(tokens)

In [12]:
pos_dict[0]

[('what', 'WP'),
 ('movies', 'NNS'),
 ('star', 'VBP'),
 ('bruce', 'NN'),
 ('willis', 'NN')]

## Remove the punctuation

Stray punctuation must be removed because it cannot be vectorized and would therefore break word vectorization. It was kept until after part-of-speech tagging because punctuation is key to sentence structure.

In [13]:
# Strip every non-alphanumeric character from the tagged words. A token that
# becomes empty (i.e. it was only punctuation) is dropped from its sentence.
non_alphanumeric = re.compile('[^a-zA-Z0-9]+')
for sentence_no in list(pos_dict):
    kept_tokens = []
    for original_word, tag in pos_dict[sentence_no]:
        stripped_word = non_alphanumeric.sub('', original_word)
        # Report every word that was changed by the stripping.
        if len(stripped_word) != len(original_word):
            print('Found stray punctuation')
            print('Original: {}'.format(original_word))
            print('New: {}'.format(stripped_word))
            print('')
            print('#################')
        if len(stripped_word) > 0:
            kept_tokens.append((stripped_word, tag))

    pos_dict[sentence_no] = kept_tokens

Found stray punctuation
Original: ,
New: 

#################


## Making words and targets equal (if needed)

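The tokenizer used for part-of-speech tagging may split words differently from the original data (e.g. "gonna" becomes "gon" + "na"), and removing punctuation drops tokens, so a sentence can end up with a different number of words than targets. The affected sentences are listed and fixed below.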

In [14]:
# Report every sentence whose number of tagged words differs from its number of targets.
# `targets` is indexed by iteration position, `pos_dict` by its own key.
for position, (sentence_no, tagged_words) in enumerate(pos_dict.items()):
    expected_length = len(targets[position])
    if len(tagged_words) != expected_length:
        print('Sentence number {} in the pos tag has an unequal length of words and corresponding targets'.format(sentence_no))

Sentence number 2212 in the pos tag has an unequal length of words and corresponding targets
Sentence number 2526 in the pos tag has an unequal length of words and corresponding targets
Sentence number 3158 in the pos tag has an unequal length of words and corresponding targets
Sentence number 3815 in the pos tag has an unequal length of words and corresponding targets
Sentence number 9713 in the pos tag has an unequal length of words and corresponding targets


### Fix sentence 2212

In [15]:
sentences[2212]

'was there a boxing movie with the song gonna fly now'

In [16]:
pos_dict[2212]

[('was', 'VBD'),
 ('there', 'EX'),
 ('a', 'DT'),
 ('boxing', 'NN'),
 ('movie', 'NN'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('song', 'NN'),
 ('gon', 'NN'),
 ('na', 'TO'),
 ('fly', 'VB'),
 ('now', 'RB')]

In [17]:
# The tokenizer split "gonna" into "gon" + "na"; merge the two tokens back into one
# so the words line up with the targets again.
# The guard makes the cell safe to re-run: an unconditional pop/pop/insert would
# delete two more (correct) tokens on every additional execution.
if [word for word, _ in pos_dict[2212][8:10]] == ['gon', 'na']:
    pos_dict[2212][8:10] = [('gonna', 'NN')]

In [18]:
pos_dict[2212]

[('was', 'VBD'),
 ('there', 'EX'),
 ('a', 'DT'),
 ('boxing', 'NN'),
 ('movie', 'NN'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('song', 'NN'),
 ('gonna', 'NN'),
 ('fly', 'VB'),
 ('now', 'RB')]

### Fix sentence 2526

In [19]:
sentences[2526]

'what movie uses the song i dont wanna miss a thing on its soundtrack'

In [20]:
pos_dict[2526]

[('what', 'WP'),
 ('movie', 'NN'),
 ('uses', 'VBZ'),
 ('the', 'DT'),
 ('song', 'NN'),
 ('i', 'NN'),
 ('dont', 'VBP'),
 ('wan', 'NN'),
 ('na', 'TO'),
 ('miss', 'VB'),
 ('a', 'DT'),
 ('thing', 'NN'),
 ('on', 'IN'),
 ('its', 'PRP$'),
 ('soundtrack', 'NN')]

In [21]:
# The tokenizer split "wanna" into "wan" + "na"; merge the two tokens back into one
# so the words line up with the targets again.
# The guard makes the cell safe to re-run: an unconditional pop/pop/insert would
# delete two more (correct) tokens on every additional execution.
if [word for word, _ in pos_dict[2526][7:9]] == ['wan', 'na']:
    pos_dict[2526][7:9] = [('wanna', 'NN')]

In [22]:
pos_dict[2526]

[('what', 'WP'),
 ('movie', 'NN'),
 ('uses', 'VBZ'),
 ('the', 'DT'),
 ('song', 'NN'),
 ('i', 'NN'),
 ('dont', 'VBP'),
 ('wanna', 'NN'),
 ('miss', 'VB'),
 ('a', 'DT'),
 ('thing', 'NN'),
 ('on', 'IN'),
 ('its', 'PRP$'),
 ('soundtrack', 'NN')]

### Fix sentence 3158

In [23]:
sentences[3158]

'what the movie with you gotta friend in me'

In [24]:
pos_dict[3158]

[('what', 'WP'),
 ('the', 'DT'),
 ('movie', 'NN'),
 ('with', 'IN'),
 ('you', 'PRP'),
 ('got', 'VBP'),
 ('ta', 'JJ'),
 ('friend', 'NN'),
 ('in', 'IN'),
 ('me', 'PRP')]

In [25]:
# The tokenizer split "gotta" into "got" + "ta"; merge the two tokens back into one
# so the words line up with the targets again.
# The guard makes the cell safe to re-run: an unconditional pop/pop/insert would
# delete two more (correct) tokens on every additional execution.
if [word for word, _ in pos_dict[3158][5:7]] == ['got', 'ta']:
    pos_dict[3158][5:7] = [('gotta', 'VBP')]

In [26]:
pos_dict[3158]

[('what', 'WP'),
 ('the', 'DT'),
 ('movie', 'NN'),
 ('with', 'IN'),
 ('you', 'PRP'),
 ('gotta', 'VBP'),
 ('friend', 'NN'),
 ('in', 'IN'),
 ('me', 'PRP')]

### Fix sentence 3815

In [27]:
sentences[3815]

'id like to see the 1975 movie with the quote youre gonna need a bigger boat'

In [28]:
pos_dict[3815]

[('id', 'NNS'),
 ('like', 'IN'),
 ('to', 'TO'),
 ('see', 'VB'),
 ('the', 'DT'),
 ('1975', 'CD'),
 ('movie', 'NN'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('quote', 'JJ'),
 ('youre', 'NN'),
 ('gon', 'NN'),
 ('na', 'TO'),
 ('need', 'VB'),
 ('a', 'DT'),
 ('bigger', 'JJR'),
 ('boat', 'NN')]

In [29]:
# The tokenizer split "gonna" into "gon" + "na"; merge the two tokens back into one
# so the words line up with the targets again.
# The guard makes the cell safe to re-run: an unconditional pop/pop/insert would
# delete two more (correct) tokens on every additional execution.
if [word for word, _ in pos_dict[3815][11:13]] == ['gon', 'na']:
    pos_dict[3815][11:13] = [('gonna', 'NN')]

In [30]:
pos_dict[3815]

[('id', 'NNS'),
 ('like', 'IN'),
 ('to', 'TO'),
 ('see', 'VB'),
 ('the', 'DT'),
 ('1975', 'CD'),
 ('movie', 'NN'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('quote', 'JJ'),
 ('youre', 'NN'),
 ('gonna', 'NN'),
 ('need', 'VB'),
 ('a', 'DT'),
 ('bigger', 'JJR'),
 ('boat', 'NN')]

### Fix sentence 9713

In [31]:
sentences[9713]

'aguirre , wrath of god'

In [32]:
pos_dict[9713]

[('aguirre', 'NN'), ('wrath', 'NN'), ('of', 'IN'), ('god', 'NN')]

In [33]:
targets[9713]

['B-TITLE', 'I-TITLE', 'I-TITLE', 'I-TITLE', 'I-TITLE']

In [34]:
# The "," token (position 1) was removed from pos_dict[9713] during punctuation
# stripping, so its label has to be removed as well. The length check keeps the cell
# idempotent: re-running it no longer drops a further, valid label.
# Written as an expression so the removed label is still shown as the cell output.
targets[9713].pop(1) if len(targets[9713]) > len(pos_dict[9713]) else None

'I-TITLE'

In [35]:
targets[9713]

['B-TITLE', 'I-TITLE', 'I-TITLE', 'I-TITLE']

## Flatten the targets into a single list

This is done to make adding the targets to the training dataframe easier

In [36]:
# Concatenate the per-sentence label lists into one flat list, keeping their order.
targets_extended = [
    label
    for sentence_targets in targets
    for label in sentence_targets
]

## Creating target dataframe

In [37]:
# One row per tagged word: (sentence number, word, part-of-speech tag).
L = []
for sentence_no, tagged_words in pos_dict.items():
    for tagged_word in tagged_words:
        L.append((sentence_no, *tagged_word))

# Attach the flattened labels as the `target` column; pandas raises if the number
# of labels does not match the number of rows.
df_target = pd.DataFrame(L, columns=['sentence_no','word','pos']).assign(target=targets_extended)

In [38]:
df_target.head()

Unnamed: 0,sentence_no,word,pos,target
0,0,what,WP,O
1,0,movies,NNS,O
2,0,star,VBP,O
3,0,bruce,NN,B-ACTOR
4,0,willis,NN,I-ACTOR


In [39]:
# Persist the word-level training table as CSV. With pandas' default arguments the
# row index is written too, as the first (unnamed) column of the file.
df_target.to_csv(movie_queries_training_dataset_filepath)

## Getting the targets

These mappings are used later to convert targets to indexes and vice versa.

In [40]:
# Distinct labels present in the dataset; a set carries no meaningful order.
unique_targets = set(df_target['target'].tolist())
all_targets = list(unique_targets)

In [41]:
all_targets

['I-TITLE',
 'B-TRAILER',
 'O',
 'B-TITLE',
 'B-CHARACTER',
 'I-RATINGS_AVERAGE',
 'I-TRAILER',
 'B-YEAR',
 'I-SONG',
 'B-RATING',
 'I-DIRECTOR',
 'I-YEAR',
 'B-GENRE',
 'I-RATING',
 'B-PLOT',
 'I-PLOT',
 'I-GENRE',
 'B-DIRECTOR',
 'I-ACTOR',
 'B-ACTOR',
 'B-SONG',
 'B-RATINGS_AVERAGE',
 'B-REVIEW',
 'I-REVIEW',
 'I-CHARACTER']

In [42]:
# Work on a copy so `all_targets` itself is left untouched, then order it alphabetically.
sorted_all_targets = list(all_targets)
sorted_all_targets.sort()

In [43]:
sorted_all_targets

['B-ACTOR',
 'B-CHARACTER',
 'B-DIRECTOR',
 'B-GENRE',
 'B-PLOT',
 'B-RATING',
 'B-RATINGS_AVERAGE',
 'B-REVIEW',
 'B-SONG',
 'B-TITLE',
 'B-TRAILER',
 'B-YEAR',
 'I-ACTOR',
 'I-CHARACTER',
 'I-DIRECTOR',
 'I-GENRE',
 'I-PLOT',
 'I-RATING',
 'I-RATINGS_AVERAGE',
 'I-REVIEW',
 'I-SONG',
 'I-TITLE',
 'I-TRAILER',
 'I-YEAR',
 'O']

In [44]:
# Put the 'O' (outside) label first so that it receives index 0, keeping the
# remaining labels in sorted order.
# Rebuilding the list by value instead of pop(-1)/insert(0) does not depend on 'O'
# happening to sort last, and re-running the cell neither removes a real label nor
# inserts a duplicate 'O'.
sorted_all_targets = ['O'] + [label for label in sorted_all_targets if label != 'O']

In [45]:
# Number the labels consecutively from 0 in the order of `sorted_all_targets`.
target_to_index = {
    label: position
    for position, label in enumerate(sorted_all_targets)
}

In [46]:
target_to_index

{'O': 0,
 'B-ACTOR': 1,
 'B-CHARACTER': 2,
 'B-DIRECTOR': 3,
 'B-GENRE': 4,
 'B-PLOT': 5,
 'B-RATING': 6,
 'B-RATINGS_AVERAGE': 7,
 'B-REVIEW': 8,
 'B-SONG': 9,
 'B-TITLE': 10,
 'B-TRAILER': 11,
 'B-YEAR': 12,
 'I-ACTOR': 13,
 'I-CHARACTER': 14,
 'I-DIRECTOR': 15,
 'I-GENRE': 16,
 'I-PLOT': 17,
 'I-RATING': 18,
 'I-RATINGS_AVERAGE': 19,
 'I-REVIEW': 20,
 'I-SONG': 21,
 'I-TITLE': 22,
 'I-TRAILER': 23,
 'I-YEAR': 24}

In [47]:
# Reverse lookup: integer index -> label.
index_to_target = {index: label for label, index in target_to_index.items()}

In [48]:
index_to_target

{0: 'O',
 1: 'B-ACTOR',
 2: 'B-CHARACTER',
 3: 'B-DIRECTOR',
 4: 'B-GENRE',
 5: 'B-PLOT',
 6: 'B-RATING',
 7: 'B-RATINGS_AVERAGE',
 8: 'B-REVIEW',
 9: 'B-SONG',
 10: 'B-TITLE',
 11: 'B-TRAILER',
 12: 'B-YEAR',
 13: 'I-ACTOR',
 14: 'I-CHARACTER',
 15: 'I-DIRECTOR',
 16: 'I-GENRE',
 17: 'I-PLOT',
 18: 'I-RATING',
 19: 'I-RATINGS_AVERAGE',
 20: 'I-REVIEW',
 21: 'I-SONG',
 22: 'I-TITLE',
 23: 'I-TRAILER',
 24: 'I-YEAR'}

In [49]:
# Persist both lookup tables with pickle.
# NOTE: despite the .txt extension these files contain binary pickle data,
# which is why they are opened in "wb" mode.
for mapping, mapping_filepath in (
    (target_to_index, target_to_index_filepath),
    (index_to_target, index_to_target_filepath),
):
    # open(..., "wb") fails when the destination folder does not exist yet
    # (e.g. on a fresh checkout), so create it first.
    os.makedirs(os.path.dirname(mapping_filepath), exist_ok=True)
    with open(mapping_filepath, "wb") as t:
        pickle.dump(mapping, t)