# Advanced Supervised Deep Learning Models

## Import necessary dependencies

In [1]:
import pandas as pd
import numpy as np
import text_normalizer as tn
import model_evaluation_utils as meu

# Show floats with two decimals and wrap printed arrays at 80 columns.
np.set_printoptions(
    linewidth=80,
    precision=2,
)

In [None]:
# Load the movie review dataset (one 'review' text column, one 'sentiment' label column).
# The relative path uses forward slashes so it resolves on Windows, Linux and macOS alike;
# a backslash is only treated as a path separator on Windows.
dataset = pd.read_csv('data/movie_reviews.csv')

# take a peek at the data
print(dataset.head())
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# build train and test datasets: the first 35000 rows are used for training,
# every remaining row is held out for testing
train_reviews = reviews[:35000]
train_sentiments = sentiments[:35000]
test_reviews = reviews[35000:]
test_sentiments = sentiments[35000:]

# normalize datasets (train and test go through the same text_normalizer routine)
norm_train_reviews = tn.normalize_corpus(train_reviews)
norm_test_reviews = tn.normalize_corpus(test_reviews)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


## Tokenize train & test datasets

In [None]:
# Workflow Step 2: decompose every normalized text review into its corresponding tokens.
tokenized_train = list(map(tn.tokenizer.tokenize, norm_train_reviews))
tokenized_test = list(map(tn.tokenizer.tokenize, norm_test_reviews))

## Build Vocabulary Mapping (word to index)

In [None]:
#For feature engineering (Step 3), we will be creating word embeddings. However, we will create them
#ourselves using keras instead of using pre-built ones like word2vec or GloVe.
#Word embeddings vectorize text documents into fixed-size vectors such that these vectors try to capture
#contextual and semantic information.

#For generating embeddings, we will use the Embedding layer from keras, which requires documents
#to be represented as tokenized and numeric vectors. We already have tokenized text vectors in our
#tokenized_train and tokenized_test variables. However, we need to convert them into numeric
#representations. Besides this, we also need the vectors to be of uniform size even though the
#tokenized text reviews will be of variable length due to the difference in the number of tokens in each review. For
#this, one strategy is to take the length of the longest review (the one with the maximum number of tokens/words)
#and set it as the vector size; let's call this max_len. Shorter reviews can be padded with a PAD term
#at the beginning to increase their length to max_len.

#We need to create a word-to-index vocabulary mapping for representing each tokenized text
#review in a numeric form. Note that we also need a numeric mapping for the padding term,
#which we shall call PAD_INDEX and assign the numeric index 0. Unknown terms, in case they are
#encountered later on in the test dataset or in newer, previously unseen reviews, need to be assigned
#an index too. This is because we vectorize, engineer features, and build models only on the
#training data. Hence, if some new term comes up in the future (one that was not part of the
#model training), we will consider it an out-of-vocabulary (OOV) term and assign it a constant index (we
#will name this term NOT_FOUND_INDEX and assign it an index one greater than the largest word index, i.e.
#the number of distinct training words + 1). The following snippet helps
#us create this vocabulary from our tokenized_train corpus of training text reviews.

from collections import Counter

# build word to index vocabulary
# Count every token of the training corpus; each distinct token becomes one vocabulary entry.
token_counter = Counter(token for review in tokenized_train for token in review)

# Word indices start at 1, in first-seen order, because index 0 is reserved for padding.
vocab_map = {token: index for index, token in enumerate(token_counter, start=1)}

# Largest index handed to a real word. It equals the number of distinct tokens, so it is
# simply 0 for an empty corpus (taking np.max over an empty list would raise ValueError),
# and it is a plain int like every other index in the map.
max_index = len(token_counter)

# Special entries: padding gets index 0, out-of-vocabulary terms get the index right
# after the last real word.
vocab_map['PAD_INDEX'] = 0
vocab_map['NOT_FOUND_INDEX'] = max_index + 1

# Total number of indices, including the two special entries above.
vocab_size = len(vocab_map)

# view vocabulary size and part of the vocabulary map
print('Vocabulary Size:', vocab_size)
print('Sample slice of vocabulary map:', dict(list(vocab_map.items())[10:20]))