### Imports

In [1]:
import numpy as np
import pandas as pd #dataframe
import os #sanity check on subfolders being present
import re #data cleaning

### Data

#### reading input file

In [2]:
# Sanity check: list the dataset folders under ../input.
# os.path.join keeps the separator portable and avoids the invalid "\i"
# escape sequence that a plain '..\input' literal contains.
os.listdir(os.path.join('..', 'input'))

['glove-global-vectors-for-word-representation',
 'movie-review-sentiment-analysis-kernels-only']

In [3]:
# Do not truncate long cell contents when a frame is displayed.
pd.set_option('display.max_colwidth', None)

# Build the path with os.path.join so the notebook is not tied to Windows
# backslash separators. The training file is tab-separated.
train_path = os.path.join('..', 'input',
                          'movie-review-sentiment-analysis-kernels-only',
                          'train.tsv')
df = pd.read_csv(train_path, delimiter='\t')
df.head(3)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1
1,2,1,A series of escapades demonstrating the adage that what is good for the goose,2
2,3,1,A series,2


In [4]:
# Keep only the phrase text and its label. The explicit copy makes the result
# an independent frame, so columns added to it later are never written
# through a selection of the frame that was read from disk.
df = df[['Phrase', 'Sentiment']].copy()
df.head(3)

Unnamed: 0,Phrase,Sentiment
0,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1
1,A series of escapades demonstrating the adage that what is good for the goose,2
2,A series,2


#### cleaning the data

In [5]:
def clean_str(in_str):
    """Normalise a phrase for downstream text processing.

    The input is converted to ``str``, every URL is replaced by the literal
    token ``url``, each run of characters that is neither whitespace nor a
    word character (underscores included) is deleted, and the result is
    stripped of surrounding whitespace and lowercased.

    Note that deleted punctuation is not replaced by a space, so
    ``"6-year-old"`` becomes ``"6yearold"`` and interior double spaces
    may remain.
    """
    url_pattern = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})"
    non_word_pattern = r'([^\s\w]|_)+'

    without_urls = re.sub(url_pattern, "url", str(in_str))
    without_punctuation = re.sub(non_word_pattern, '', without_urls)
    return without_punctuation.strip().lower()


# Store the normalised phrase in a new 'text' column next to the raw one.
df['text'] = df['Phrase'].map(clean_str)
df.head(3)

Unnamed: 0,Phrase,Sentiment,text
0,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1,a series of escapades demonstrating the adage that what is good for the goose is also good for the gander some of which occasionally amuses but none of which amounts to much of a story
1,A series of escapades demonstrating the adage that what is good for the goose,2,a series of escapades demonstrating the adage that what is good for the goose
2,A series,2,a series


#### balancing classes

Our data is classified into 5 different classes: very negative (0), slightly negative (1), neutral (2), slightly positive (3) and very positive (4).

Sadly our dataset isn't balanced, so we need to balance it ourselves by down-sampling every class to the size of the smallest one.

In [6]:
# Number of phrases per sentiment label, most frequent first.
df['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [7]:
# Shuffle the rows of each sentiment class (labels 0-4) separately.
# A fixed random_state makes these shuffles reproducible across kernel
# restarts; without it every run draws a different subset below.
df_0, df_1, df_2, df_3, df_4 = (
    df[df['Sentiment'] == label].sample(frac=1, random_state=42)
    for label in range(5)
)

In [8]:
# We want a balanced set for training: every class is cut down to the size
# of the rarest class (7072 rows of class `0` in this dataset). The size is
# derived from the data so it stays correct if the input file changes.
sample_size = int(df['Sentiment'].value_counts().min())

# Take the first `sample_size` rows of each per-class frame, stack them, and
# shuffle the combined frame so the classes are interleaved. The fixed
# random_state pins this final shuffle.
data = pd.concat(
    [class_df.head(sample_size) for class_df in (df_0, df_1, df_2, df_3, df_4)]
).sample(frac=1, random_state=42)
data.head(10)

Unnamed: 0,Phrase,Sentiment,text
86903,genial but never inspired,2,genial but never inspired
59748,is insipid,1,is insipid
66820,to slap her creators because they 're clueless and inept,1,to slap her creators because they re clueless and inept
121870,Just another generic drama that has nothing going for it other than its exploitive array of obligatory cheap,0,just another generic drama that has nothing going for it other than its exploitive array of obligatory cheap
108298,'s nothing more satisfying during a summer of event movies than a spy thriller like The Bourne Identity that 's packed with just as much intelligence as action,4,s nothing more satisfying during a summer of event movies than a spy thriller like the bourne identity that s packed with just as much intelligence as action
122046,swear you are wet in some places,2,swear you are wet in some places
18636,", my 6-year-old nephew said ,",2,my 6yearold nephew said
66494,as boring and as obvious,1,as boring and as obvious
88066,Thumbs Friggin ' Down,0,thumbs friggin down
70918,uncommonly sincere,3,uncommonly sincere


### Word Embeddings

#### sequence length

In [9]:
# Length of each raw phrase, counted as the number of pieces obtained by
# splitting on single spaces.
data['l'] = [len(str(phrase).split(' ')) for phrase in data['Phrase']]

phrase_lengths = data['l']
print("mean length of sentence: " + str(phrase_lengths.mean()))
print("max length of sentence: " + str(phrase_lengths.max()))
print("std dev length of sentence: " + str(phrase_lengths.std()))

mean length of sentence: 9.087132352941177
max length of sentence: 52
std dev length of sentence: 8.016948281468894


In [10]:
# These sentences aren't that long, so we may as well use the whole string:
# 52 is the maximum phrase length reported by the statistics printed above.
sequence_length = 52

#### mapping words to integers

In [11]:
# Upper bound on the vocabulary size; used as `max_tokens` when the
# TextVectorization layer is created.
max_features = 20000 # this is the number of words we care about

In [12]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Layer that turns strings into integer sequences of length `sequence_length`,
# with a vocabulary capped at `max_features` tokens.
# NOTE(review): the layer is fitted on the raw 'Phrase' column, not on the
# cleaned 'text' column built earlier — confirm that this is intended.
vectorizer = TextVectorization(
    max_tokens=max_features,
    output_sequence_length=sequence_length,
)

# Feed the phrases to adapt() in batches of 128 to build the vocabulary.
text_ds = tf.data.Dataset.from_tensor_slices(data['Phrase'].values).batch(128)
vectorizer.adapt(text_ds)

In [13]:
# Peek at the first five entries of the vocabulary held by the vectorizer.
vectorizer.get_vocabulary()[:5]

[b'the', b'a', b'and', b'of', b'to']

Let's vectorize a test sentence:

In [14]:
# Run a one-sentence batch through the vectorizer and show the ids of the
# first six positions (one per word of the sentence).
sample_batch = [["the cat sat on the mat"]]
output = vectorizer(sample_batch)
output.numpy()[0][:6]

array([    2,  3914,  8825,    23,     2, 11367], dtype=int64)

Here's a dict mapping words to their indices:

In [15]:
# Map every vocabulary entry to its position in the get_vocabulary() list.
# NOTE(review): the recorded outputs show these positions are 2 lower than
# the ids the vectorizer itself emits ('the' -> 0 here vs 2 from the layer);
# confirm that any code combining word_index with vectorizer output
# accounts for that offset.
voc = vectorizer.get_vocabulary()
word_index = {word: position for position, word in enumerate(voc)}

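WARNING!! - Notice that `word_index` maps 'the' to 0, but in the previous cell the vectorizer output for 'the' was 2: the ids produced by the vectorizer are offset by 2 from the positions in `get_vocabulary()`. This doesn't affect our program as long as we work ONLY out of `word_index`; however, `X` below is produced by the vectorizer itself, so any step that combines `word_index` with `X` (for example, filling the rows of an embedding matrix) must account for this offset.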

In [16]:
# Look up the words of the test sentence directly in word_index. The keys
# are bytes, matching the entries shown by get_vocabulary() above.
test = [b"the", b"cat", b"sat", b"on", b"the", b"mat"]
[word_index[w] for w in test]

[0, 3912, 8823, 21, 0, 11365]

#### vectorizing input

In [17]:
# Show the first raw phrase of the balanced, shuffled set.
data['Phrase'].iloc[0]

'genial but never inspired'

In [18]:
# Wrap every phrase in its own one-element row so the vectorizer receives one
# string per sample, then convert the resulting tensor to a NumPy array.
phrase_column = np.array([[phrase] for phrase in data['Phrase'].values])
X = vectorizer(phrase_column).numpy()

In [19]:
# Location of the GloVe vectors file (glove.6B.100d). os.path.join keeps the
# path independent of the platform's separator instead of hard-coding
# Windows backslashes.
path_to_glove_file = os.path.join(
    '..', 'input',
    'glove-global-vectors-for-word-representation',
    'glove.6B.100d.txt',
)

# word (str) -> coefficient vector, one entry per line of the file.
embeddings_index = {}
with open(path_to_glove_file, encoding="utf8") as f:
    for line in f:
        # Split off the leading word once; the remainder of the line is the
        # space-separated coefficients, parsed as single-precision floats.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


### Model