In [2]:
from keras.layers.core import Activation, Dense, Dropout, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
# conda install -c conda-forge keras

from sklearn.model_selection import train_test_split
import collections
import matplotlib.pyplot as plt
import nltk
# conda install -c anaconda nltk

import numpy as np
import os
import pandas as pd

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
DATA_DIR = "./data"

maxlen = 0
word_freqs = collections.Counter()
num_recs = 0
train_data = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
train_data.head()

Unnamed: 0,sentence,label
0,I hate Harry Potter.,0
1,The first action theme to be played as the fir...,1
2,"Always knows what I want, not guy crazy, hates...",0
3,"Is it just me, or does Harry Potter suck?...",0
4,friday hung out with kelsie and we went and sa...,0


In [4]:
num_recs = len(train_data)
print(num_recs)

5668


In [5]:
# tokenize word

maxlen = 0 # maximum sentence length
word_freqs = collections.Counter() # counting hashtable objects, type:dict

for index, row in train_data.iterrows():
    words = nltk.word_tokenize(row['sentence'].lower())
    if (len(words) > maxlen):
        maxlen = len(words)
    for word in words:
        word_freqs[word] += 1
print(len(word_freqs))
print(maxlen)

# word_freqs: (word, freqs)

2094
42


In [31]:
MAX_FEATURES = 2094 # number of unique words
MAX_SENTENCE_LENGTH = 42 # maximum sentence length

In [32]:
vocab_size = MAX_FEATURES + 2
word2index = word2index = {x[0]: i+2 for i, x in enumerate(word_freqs.most_common(MAX_FEATURES))}
# word_freqs: (word, count)

word2index["PAD"] = 0
word2index["UNK"] = 1
# word2index: (word, index), from most to least

index2word = {v:k for k, v in word2index.items()}
# word2index: (index, word), from most to least

In [33]:
print(word2index)

{'i': 2, ',': 3, '.': 4, 'the': 5, 'and': 6, 'harry': 7, '!': 8, 'brokeback': 9, 'vinci': 10, 'mountain': 11, 'da': 12, 'code': 13, 'potter': 14, '...': 15, 'love': 16, 'is': 17, 'a': 18, 'was': 19, 'mission': 20, 'impossible': 21, 'awesome': 22, 'like': 23, 'it': 24, 'to': 25, 'movie': 26, 'that': 27, "'s": 28, 'because': 29, 'hate': 30, 'sucks': 31, 'sucked': 32, 'so': 33, 'as': 34, 'my': 35, '``': 36, 'much': 37, 'of': 38, 'movies': 39, 'stupid': 40, 'really': 41, 'you': 42, 'down': 43, 'we': 44, 'with': 45, 'but': 46, 'just': 47, 'one': 48, 'be': 49, 'potter..': 50, 'know': 51, 'out': 52, 'suck': 53, '/': 54, '3': 55, 'who': 56, 'or': 57, 'am': 58, 'into': 59, 'loved': 60, 'which': 61, 'for': 62, 'want': 63, 'right': 64, 'an': 65, "n't": 66, 'me': 67, 'this': 68, 'think': 69, 'are': 70, ':': 71, 'how': 72, 'not': 73, 'depressing': 74, 'if': 75, 'his': 76, 'in': 77, 'why': 78, 'there': 79, 'terrible': 80, 'people': 81, 'only': 82, 'would': 83, 'reading': 84, 'series': 85, 'oh': 86, 

In [34]:
X = np.empty((num_recs, ), dtype=list)
y = np.zeros((num_recs, ))
i = 0
for index, row in train_data.iterrows():
    words = nltk.word_tokenize(row['sentence'].lower())
    seqs = []
    for word in words:
        if word in word2index:
            seqs.append(word2index[word])
        else:
            seqs.append(word2index["UNK"])
    X[i] = seqs
    y[i] = int(row['label'])
    i += 1

X = sequence.pad_sequences(X, maxlen=MAX_SENTENCE_LENGTH)
print(X)

[[  0   0   0 ...   7  14   4]
 [  0   0   0 ...  89 592   4]
 [  0   0   0 ... 166   7  50]
 ...
 [  0   0   0 ...   7  14   4]
 [  0   0   0 ... 252 253   4]
 [  0   0   0 ...  22  26   4]]


In [35]:
EMBEDDING_SIZE = 128
HIDDEN_LAYER_SIZE = 64
BATCH_SIZE = 32
NUM_EPOCHS = 10

In [37]:
model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=MAX_SENTENCE_LENGTH))
model.add(Dropout(0.2))
model.add(LSTM(HIDDEN_LAYER_SIZE, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1))
model.add(Activation("sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

history = model.fit(X, y, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
