# IMDB movie review sentiment example

Build a quick LSTM model that learns whether a movie review is positive or negative, using the nutshell library.

Validation accuracy: 87.2%

In [1]:
import pandas as pd
import numpy as np

from nutshell import ModelData, Learner, TextReader

Using TensorFlow backend.
  return f(*args, **kwds)


## Parse movie review txt files into lists of words

In [2]:
# Load the raw IMDB movie reviews from disk.
#
# Data source:
#   https://github.com/jalbertbowden/large-movie-reviews-dataset/tree/master/acl-imdb-v1
# Copy the `train` and `test` subdirectories into this notebook's directory
# before running this cell.

reader = TextReader()
pos_texts = reader.read_text_files('./train/pos/*.txt')
neg_texts = reader.read_text_files('./train/neg/*.txt')

# Positive reviews are placed first, followed by the negative ones; the label
# list is built in the same order (1 = positive, 0 = negative).
texts = [*pos_texts, *neg_texts]
labels = [1 for _ in pos_texts] + [0 for _ in neg_texts]

# Normalise every review: lowercase it, then run the search-and-replace rules
# below. The rules remove markup/quote characters and pad punctuation with
# spaces so that periods, commas, etc. end up as separate space-delimited words.
replacements = {
    '<br />': '',
    '"': '',
    '(': '( ',
    ')': ' )',
    "'s ": " 's ",
    '?': ' ? ',
    '-': ' ',
    ', ': ' , ',
    '. ': ' . ',
    '*': '',
}

# Overwrite each entry in place so `texts` keeps its ordering.
for i, review in enumerate(texts):
    texts[i] = reader.multi_replace(review.lower(), replacements)
        
# Parse each review into a list of words (split on single spaces).
#
# Reviews that yield one token or fewer are dropped. Their labels have to be
# dropped with them: `labels` holds one entry per item in `texts`, so filtering
# only the word lists would leave the two sequences misaligned (or of different
# lengths) when they are later placed side by side in a DataFrame.

# zip() silently truncates to the shorter input, so check the lengths first.
assert len(texts) == len(labels), 'texts and labels must have the same length'

word_lists = []
kept_labels = []
for text, label in zip(texts, labels):
    word_list = text.split(' ')
    if len(word_list) > 1:
        word_lists.append(word_list)
        kept_labels.append(label)

# Rebind `labels` so it stays index-aligned with `word_lists`.
labels = kept_labels

print('Parsed', len(word_lists), 'reviews')
        

Parsed 25000 reviews


## Format data for building a simple LSTM for classification
### - one that is able to predict whether the review sentiment is positive or negative

- The single input is a list of word token ids
 - The words in each review are tokenized by `prepare_data()` (called below)
- The label is a 1 for positive and 0 for negative
- The model will output a floating point number between 0 and 1
 - Values >= .5 can be considered positive reviews


In [3]:
# Assemble the model input: one row per review, holding its word list and its
# 0/1 sentiment label. Column order is pinned explicitly.
dfInput = pd.DataFrame(
    {'words': word_lists, 'label': labels},
    columns=['words', 'label'],
)

# Describe the columns to ModelData, then let it prepare the data.
data = ModelData(dfInput)
data.category_columns = ['words']  # contents are categories, not numeric values
data.sequence_columns = ['words']  # each cell holds a list of category values
data.label_column = 'label'
data.sequence_length = 1000  # almost all reviews are < 1000 words
data.validation_split = .10
data.prepare_data()

Tokenizing category columns...
words 153820 unique values
Done preparing data


In [4]:
# Split the prepared data into training and validation sets, with shuffling
# enabled.
# NOTE(review): no random seed is set in the visible cells, so the split (and
# the reported validation accuracy) may vary between runs — confirm whether
# split_data supports seeding.
data.split_data(shuffle=True)

Training examples: 22500
Validation examples: 2500


## Define Keras Model

The Learner object chooses the LSTM/Dropout layer pairs for the sequence inputs.

In [7]:
# build model
# Create a Learner for the prepared data, set its hyperparameters, then build
# the model. The attribute assignments are kept in their original order.
learner = Learner(data)
learner.hidden_layers = 2 # number of lstm/dropout layer pairs
learner.dropout_rate = .30 # presumably the rate used by each Dropout layer — TODO confirm
learner.batch_size = 256
learner.lstm_units = 256 # matches the 256-wide LSTM outputs in the model summary
# NOTE(review): hard-coded GPU flag; confirm how Learner behaves on a CPU-only machine.
learner.gpu = True
learner.build_model()


Sequential Merge Layer Shape:  (?, 1000, 50)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_words (InputLayer)     (None, 1000)              0         
_________________________________________________________________
embed_words (Embedding)      (None, 1000, 50)          7691200   
_________________________________________________________________
lstm_0 (LSTM)                (None, 1000, 256)         314368    
_________________________________________________________________
lstm_dropout_0 (Dropout)     (None, 1000, 256)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 1000, 256)         525312    
_________________________________________________________________
lstm_dropout_1 (Dropout)     (None, 1000, 256)         0         
_________________________________________________________________
lstm_timedist (TimeDistribut (N

In [8]:
# First training pass: 3 epochs, with no learning rate passed (the training log
# reports 0.001).
# NOTE(review): `filename` is presumably where the model/weights are saved — confirm.
learner.train_model(filename='imdb_simple', epochs=3)

Super Epoch: 1
Learning Rate: 0.001
Train on 22500 samples, validate on 2500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [9]:
# Second training pass: 1 more epoch at a lower learning rate (.0001), using
# the same filename as the first pass.
# NOTE(review): presumably continues from the weights trained above — confirm.
learner.train_model(filename='imdb_simple', learning_rate=.0001, epochs=1)

Super Epoch: 1
Learning Rate: 0.0001
Train on 22500 samples, validate on 2500 samples
Epoch 1/1
