In [19]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN
from keras.layers.recurrent import LSTM

In [20]:
model = Sequential()

In [21]:
model.add(Embedding(output_dim=32,input_dim=3800,input_length=380))
model.add(Dropout(0.35))

In [22]:
model.add(LSTM(32))

In [23]:
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.35))
model.add(Dense(units=1, activation='sigmoid'))

In [24]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 380, 32)           121600    
_________________________________________________________________
dropout_4 (Dropout)          (None, 380, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_3 (Dense)              (None, 256)               8448      
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 257       
Total params: 138,625
Trainable params: 138,625
Non-trainable params: 0
_________________________________________________________________


In [25]:
import re
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)
import os
def read_files(filetype):
    path = "data/aclImdb/"
    file_list = []
    
    positive_path = path + filetype + "/pos/"
    for f in os.listdir(positive_path):
        file_list += [positive_path+f]
        
    negative_path = path + filetype + "/neg/"
    for f in os.listdir(negative_path):
        file_list += [negative_path+f]
    
    print('read', filetype, 'files:', len(file_list))
    
    all_labels = ([1] * 12500 + [0] * 12500)
    
    all_texts = []
    for fi in file_list:
        with open(fi, encoding='utf8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]
            
    return all_labels, all_texts

In [26]:
y_train, train_text = read_files("train")
y_test, test_text = read_files("test")

read train files: 25000
read test files: 25000


In [27]:
from keras.preprocessing.text import Tokenizer
token = Tokenizer(num_words=3800)
token.fit_on_texts(train_text)

In [28]:
from keras.preprocessing import sequence
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)
x_train = sequence.pad_sequences(x_train_seq, maxlen=380)
x_test = sequence.pad_sequences(x_test_seq, maxlen=380)

In [29]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [30]:
train_history = model.fit(x_train, y_train, batch_size=100, epochs=10, verbose=2, validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 92s - loss: 0.4803 - acc: 0.7554 - val_loss: 0.5571 - val_acc: 0.7190
Epoch 2/10
 - 78s - loss: 0.2798 - acc: 0.8865 - val_loss: 0.6674 - val_acc: 0.7446
Epoch 3/10
 - 79s - loss: 0.2333 - acc: 0.9086 - val_loss: 0.5179 - val_acc: 0.7848
Epoch 4/10
 - 76s - loss: 0.2092 - acc: 0.9202 - val_loss: 0.4383 - val_acc: 0.8092
Epoch 5/10
 - 73s - loss: 0.1898 - acc: 0.9264 - val_loss: 0.3416 - val_acc: 0.8430
Epoch 6/10
 - 75s - loss: 0.1843 - acc: 0.9309 - val_loss: 0.4258 - val_acc: 0.8320
Epoch 7/10
 - 75s - loss: 0.1623 - acc: 0.9409 - val_loss: 0.3820 - val_acc: 0.8438
Epoch 8/10
 - 87s - loss: 0.1533 - acc: 0.9440 - val_loss: 0.3638 - val_acc: 0.8570
Epoch 9/10
 - 81s - loss: 0.1628 - acc: 0.9393 - val_loss: 0.5939 - val_acc: 0.7936
Epoch 10/10
 - 80s - loss: 0.1332 - acc: 0.9511 - val_loss: 0.4261 - val_acc: 0.8336


In [31]:
scores = model.evaluate(x_test, y_test, verbose=1)
scores[1]



0.86184