## RNN: Sentiment Analysis

In [1]:
import os
# Load the raw IMDB training reviews together with their sentiment labels.
# Each review is a .txt file under <imdb_dir>/train/neg or <imdb_dir>/train/pos;
# 'neg' reviews get label 0 and 'pos' reviews get label 1.
imdb_dir = 'datasets/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')
labels = []
texts = []
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname.endswith('.txt'):
            # Pin the encoding (reviews are assumed to be UTF-8 text) so decoding
            # does not depend on the platform default; `with` closes the handle
            # even if read() raises.
            with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)

In [4]:
# Sanity check: number of reviews loaded, then the raw text of the first one.
print(len(texts), texts[0], sep='\n')

25000
Working with one of the best Shakespeare sources, this film manages to be creditable to it's source, whilst still appealing to a wider audience.<br /><br />Branagh steals the film from under Fishburne's nose, and there's a talented cast on good form.


## Preparing the Data

In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Pre-processing settings.
maxlen = 100                 # length every review sequence is padded/truncated to
max_words = 10000            # vocabulary size limit handed to the Tokenizer
validation_samples = 10000   # not referenced by the cells visible in this notebook

# Learn the vocabulary from the reviews, then map every review to a list of word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)
print('Found %s unique tokens.' % len(word_index))

# Turn the ragged index lists into one (n_reviews, maxlen) integer array and
# the Python label list into a NumPy array.
labels = np.asarray(labels)
data = pad_sequences(sequences, maxlen=maxlen)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
print(data[0])

Found 88582 unique tokens.
Shape of data tensor: (25000, 100)
Shape of label tensor: (25000,)
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0  777   16   28    4    1  115 2278 6887   11   19
 1025    5   27    5   42 2425 1861  128 2270    5    3 6985  308    7
    7 3383 2373    1   19   36  463 3169    2  222    3 1016  174   20
   49  808]


In [65]:
# Decode the first padded sequence back into words as a sanity check.
# Build the index -> word map once so each token costs a single dict lookup
# instead of a scan over the entire vocabulary. setdefault keeps the first
# word seen for an index, matching a first-match scan of word_index.
index_to_word = {}
for token, token_id in word_index.items():
    index_to_word.setdefault(token_id, token)

# Indices with no vocabulary entry (e.g. the 0 used for padding) are skipped.
words = [index_to_word[int(el)] for el in data[0] if int(el) in index_to_word]

print(" ".join(words))

is a labor of love for paul reiser who understands what it's like to be both a father and a son as well as to have both laughter and tears as you move through life the most fun part though was watching reiser watch falk you could tell it was both his character coming to a new appreciation of his father and a fellow actor really enjoying peter special craft really delightful let's hope this film makes it into theaters around the country sometime soon so everyone can have a chance to laugh and cry with paul reiser and folks


In [18]:
from sklearn.model_selection import train_test_split

# Shuffle reviews and labels with the same permutation, then hold out 30% for validation.
# Both random steps are seeded so that a fresh Restart & Run All reproduces the
# same split (and therefore comparable metrics for the models trained below).
seed = 42
rng = np.random.RandomState(seed)
indices = rng.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

x_train, x_val, y_train, y_val = train_test_split(
    data, labels, test_size=0.3, random_state=seed)

## Load Embeddings

In [19]:
# Parse the pre-trained GloVe file into a dict: word -> float32 vector.
# Each line holds a token followed by its whitespace-separated coefficients.
glove_dir = 'datasets/glove.6B'
embeddings_index = {}
# Pin the encoding (the GloVe text file is assumed to be UTF-8) so parsing does
# not depend on the platform default; `with` closes the file even if a line
# fails to parse.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


## Build Embedding Matrix

In [20]:
# Assemble the (max_words, embedding_dim) weight matrix for the Embedding layer:
# row i holds the GloVe vector of the word whose tokenizer index is i.
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        # Index falls outside the matrix; nothing to store.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    # Words missing from GloVe keep their all-zero row.

## Model1: non-trainable embeddings

In [42]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, LSTM, Conv1D, MaxPooling1D

# Model 1: frozen GloVe embeddings -> single LSTM -> sigmoid output.
# The embedding weights are supplied at construction time and marked
# non-trainable. The same effect can be had after building the model with
# model.layers[0].set_weights([embedding_matrix]) followed by
# model.layers[0].trainable = False.
model = Sequential([
    Embedding(max_words,
              embedding_dim,
              weights=[embedding_matrix],
              input_length=maxlen,
              trainable=False),
    LSTM(embedding_dim, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 100, 100)          1000000   
_________________________________________________________________
lstm_7 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 80,501
Non-trainable params: 1,000,000
_________________________________________________________________


In [34]:
# Train Model 1 as a binary classifier and keep the per-epoch metrics in `history`.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=32,
    epochs=5,
)

Train on 17500 samples, validate on 7500 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Model2: trainable embeddings

In [36]:
# Model 2: same architecture as Model 1, but the Embedding layer is not frozen,
# so the GloVe vectors loaded below act as an initialisation that training can update.
model2 = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    LSTM(embedding_dim, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
# Overwrite the embedding layer's initial weights with the pre-trained matrix.
model2.layers[0].set_weights([embedding_matrix])

In [38]:
# Train Model 2 with the same optimiser, loss and schedule as Model 1.
# Note: this rebinds `history`, replacing the record from the previous fit.
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
history = model2.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=32,
    epochs=5,
)

Train on 17500 samples, validate on 7500 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Model3: RNNs and CNNs for text classification

In [44]:
# Model 3: frozen GloVe embeddings -> Conv1D -> max-pooling -> LSTM -> sigmoid output.
model3 = Sequential()
model3.add(Embedding(max_words, embedding_dim, input_length=maxlen))
# 32 filters over windows of 3 tokens; 'same' padding keeps the sequence length.
model3.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
# Pooling with size 2 halves the sequence length fed to the LSTM.
model3.add(MaxPooling1D(pool_size=2))
model3.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model3.add(Dense(1, activation='sigmoid'))
# Load the pre-trained vectors and freeze them (done before compile()).
model3.layers[0].set_weights([embedding_matrix])
model3.layers[0].trainable = False
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 100, 100)          1000000   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 100, 32)           9632      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 50, 32)            0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 101       
Total params: 1,062,933
Trainable params: 62,933
Non-trainable params: 1,000,000
_________________________________________________________________
None


In [45]:
# Train Model 3 with the same optimiser, loss and schedule as the earlier models.
# Note: this rebinds `history`, replacing the record from the previous fit.
model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
history = model3.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=32,
    epochs=5,
)

Train on 17500 samples, validate on 7500 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
