# IMDB movie review sentiment sample

Learn whether a movie review is positive or negative using the nutshell library.

Modify the default LSTM model to use a 1D Convolutional network instead.

Validation accuracy = 90.4%

In [2]:
import pandas as pd
import numpy as np

from nutshell import ModelData, Learner, TextReader

## Parse movie review txt files into lists of words

In [3]:
# read imdb movie review files into a list

# download data from github - https://github.com/jalbertbowden/large-movie-reviews-dataset/tree/master/acl-imdb-v1
# copy train & test subdirectories to this directory

reader = TextReader()
pos_texts = reader.read_text_files('./train/pos/*.txt')
neg_texts = reader.read_text_files('./train/neg/*.txt')

texts = pos_texts + neg_texts
labels = ([1] * len(pos_texts)) + ([0] * len(neg_texts))

# search and replace these values in each review
# treat periods and commas like words and strip off some characters
replacements = {'<br />': '', '"': '', '(': '( ',')': ' )', "'s ": " 's ",
                '?': ' ? ', '-': ' ', ', ': ' , ', '. ': ' . ', '*': ''}

for i in range(0,len(texts)):
    texts[i] = texts[i].lower()
    texts[i] = reader.multi_replace(texts[i], replacements)
        
# parse review text into lists of words (delimited by " ")

word_lists = []
for text in texts:
    word_list = text.split(' ')
    if len(word_list) > 1:
        word_lists.append(word_list)

print('Parsed', len(word_lists), 'reviews')   
        

Parsed 25000 reviews


## Format data for building a simple LSTM for classification
### - one that is able to predict whether the review sentiment is positive or negative

- The single input is a list of word token ids
 - The words in the review were tokenized in the prepare_data 
- The label is a 1 for positive and 0 for negative
- The model will output a floating point number between 0 and 1
 - Values >= .5 can be considered positive reviews


In [84]:
dfInput = pd.DataFrame()
dfInput['words'] = word_lists
dfInput['label'] = labels
data = ModelData(dfInput)
data.category_columns = ['words'] # indicates the contents are categories, not numeric values
data.sequence_columns = ['words'] # indicates the column contains a list of category values
data.label_column = 'label'
data.sequence_length = 1500 # almost all reviews are < 1000 words
data.validation_split = .10 
data.prepare_data()   

Tokenizing category columns...
words 153820 unique values
Done preparing data


In [85]:
data.split_data(shuffle=True)

Training examples: 22500
Validation examples: 2500


## Define Keras Model

Learner object will choose LSTM/Dropout layer sets for the sequential inputs.

After the default model is built, modify the model to use a 1D convolutional network instead of the LSTM.

In [89]:
# build model
learner = Learner(data)
learner.hidden_layers = 1 # number of lstm/dropout layer pairs
learner.dropout_rate = .30
learner.batch_size = 256
learner.gpu = True
learner.build_model()

Sequential Merge Layer Shape:  (?, 1500, 50)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_words (InputLayer)     (None, 1500)              0         
_________________________________________________________________
embed_words (Embedding)      (None, 1500, 50)          7691200   
_________________________________________________________________
lstm_0 (LSTM)                (None, 1500, 24)          7200      
_________________________________________________________________
lstm_dropout_0 (Dropout)     (None, 1500, 24)          0         
_________________________________________________________________
lstm_timedist (TimeDistribut (None, 1500, 24)          600       
_________________________________________________________________
lstm_reshape (Reshape)       (None, 36000)             0         
_________________________________________________________________
dense_representation (Dense) (N

In [90]:
# Replace default LSTM layers with a convolutional layer
# Also add dropout to embedding layer
# Re-compile model

# You can just replace the default model with a new model, 
#  but I want to show that you can add to the existing model also

# base on this thread: https://github.com/keras-team/keras/issues/2296

from keras.layers import Dense, Dropout, Conv1D, Lambda
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K

def max1d(X):
    return K.max(X, axis=1)

filters = 256
filter_size = 5

# add dropout to embedding layer
learner.model.get_layer('embed_words').dropout=.20

# reference to cut and paste point
embed_words = learner.model.get_layer('embed_words').output

# define new layers - attach to references
x = Conv1D(filters, filter_size, strides=1, \
                    padding='valid', activation='relu', name='conv_0')(embed_words)
x = Lambda(max1d, output_shape=(filters,), name='conv_maxpool_0')(x)
x = Dense(filters, name='conv_dense_0')(x)
x = Dropout(learner.dropout_rate)(x)
x = Dense(learner.output_factors, name='dense_representation')(x)
output = Dense(1, name='dense_output')(x)

new_model = Model(inputs=[learner.model.get_layer('input_words').input], \
                  outputs=output)
new_model.compile(loss='mse', optimizer=Adam(), metrics=['acc'] )
learner.model = new_model

print(learner.model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_words (InputLayer)     (None, 1500)              0         
_________________________________________________________________
embed_words (Embedding)      (None, 1500, 50)          7691200   
_________________________________________________________________
conv_0 (Conv1D)              (None, 1496, 256)         64256     
_________________________________________________________________
conv_maxpool_0 (Lambda)      (None, 256)               0         
_________________________________________________________________
conv_dense_0 (Dense)         (None, 256)               65792     
_________________________________________________________________
dropout_30 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_representation (Dense) (None, 50)                12850     
__________

In [91]:
learner.train_model(filename='imdb_conv', epochs=2)

Super Epoch: 1
Learning Rate: 0.001
Train on 22500 samples, validate on 2500 samples
Epoch 1/2
Epoch 2/2


In [92]:
learner.train_model(filename='imdb_conv', learning_rate=.0001, epochs=1)

Super Epoch: 1
Learning Rate: 0.0001
Train on 22500 samples, validate on 2500 samples
Epoch 1/1


In [93]:
learner.train_model(filename='imdb_conv', learning_rate=.00001, epochs=1)

Super Epoch: 1
Learning Rate: 1e-05
Train on 22500 samples, validate on 2500 samples
Epoch 1/1


In [None]:
#0 conv - 1000 len/256x3filt/.30drop/.20embdrop - ep3/1/1 = .8908 valacc
#1 conv - 1000/256x5/.30/.20 - ep2 = .8944 
#2 conv - 1000/500x3/.3/.2 - - ep3/1 = .8940
#3 conv - 1000/128x5/.3/.2 - ep3/3/3 = .8840
#4 conv - 1500/256x5/.3/.2 - ep3/2/2 = .9040 #best