In [1]:
# Imports
import numpy as np
import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from keras.preprocessing import sequence
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import EarlyStopping
%matplotlib inline
from keras.optimizers import SGD

# Seed NumPy's global random number generator so NumPy-driven randomness
# repeats between runs.
# NOTE(review): this seeds NumPy only; the notebook reports a TensorFlow
# backend, which may need its own seed for fully repeatable training — confirm.
np.random.seed(30)

Using TensorFlow backend.


In [2]:
# Fetch the IMDB review dataset that ships with Keras, with the vocabulary
# capped via num_words=5000 and the 50 most frequent words skipped
# (skip_top=50).
train_split, test_split = imdb.load_data(num_words=5000, skip_top=50)
x_train, y_train = train_split
x_test, y_test = test_split

# Report the shape of each split's review array.
for reviews in (x_train, x_test):
    print(reviews.shape)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
(25000,)


In [3]:
# Peek at the first training example: its word-index sequence, then its label.
first_review, first_label = x_train[0], y_train[0]
print(first_review)
print(first_label)

[2, 2, 2, 2, 2, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 2, 173, 2, 256, 2, 2, 100, 2, 838, 112, 50, 670, 2, 2, 2, 480, 284, 2, 150, 2, 172, 112, 167, 2, 336, 385, 2, 2, 172, 4536, 1111, 2, 546, 2, 2, 447, 2, 192, 50, 2, 2, 147, 2025, 2, 2, 2, 2, 1920, 4613, 469, 2, 2, 71, 87, 2, 2, 2, 530, 2, 76, 2, 2, 1247, 2, 2, 2, 515, 2, 2, 2, 626, 2, 2, 2, 62, 386, 2, 2, 316, 2, 106, 2, 2, 2223, 2, 2, 480, 66, 3785, 2, 2, 130, 2, 2, 2, 619, 2, 2, 124, 51, 2, 135, 2, 2, 1415, 2, 2, 2, 2, 215, 2, 77, 52, 2, 2, 407, 2, 82, 2, 2, 2, 107, 117, 2, 2, 256, 2, 2, 2, 3766, 2, 723, 2, 71, 2, 530, 476, 2, 400, 317, 2, 2, 2, 2, 1029, 2, 104, 88, 2, 381, 2, 297, 98, 2, 2071, 56, 2, 141, 2, 194, 2, 2, 2, 226, 2, 2, 134, 476, 2, 480, 2, 144, 2, 2, 2, 51, 2, 2, 224, 92, 2, 104, 2, 226, 65, 2, 2, 1334, 88, 2, 2, 283, 2, 2, 4472, 113, 103, 2, 2, 2, 2, 2, 178, 2]
1


In [4]:
# Convert every review (training and test) from a word-index sequence into
# a fixed-width row of a binary matrix with 1000 columns.
# NOTE(review): the data was loaded with num_words=5000 but the matrix only
# has 1000 columns — confirm that ignoring the higher word indices is intended.
tokenizer = Tokenizer(num_words=1000)
x_train, x_test = (
    tokenizer.sequences_to_matrix(reviews, mode='binary')
    for reviews in (x_train, x_test)
)
print(x_train.shape)

(25000, 1000)


In [5]:
# One-hot encode the sentiment labels (both splits): each integer label
# becomes a vector with num_classes entries.
num_classes = 2
y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)

# Show the resulting label-matrix shapes, training first.
for encoded_labels in (y_train, y_test):
    print(encoded_labels.shape)

(25000, 2)
(25000, 2)


In [6]:
# Width of every input vector fed to the network.
max_words = 1000

# x_train / x_test are no longer variable-length word sequences at this
# point: sequences_to_matrix already turned them into fixed-width matrices
# (the earlier cell prints (25000, 1000)). Calling sequence.pad_sequences on
# them therefore truncated and padded nothing, so it has been replaced by an
# explicit check of the invariant the model relies on.
# NOTE(review): pad_sequences also cast the matrices to its default integer
# dtype; the values are only 0/1, so training should be unaffected — confirm.
for split_name, features in (("x_train", x_train), ("x_test", x_test)):
    assert features.shape[1] == max_words, (
        "%s has %d columns, expected %d"
        % (split_name, features.shape[1], max_words)
    )

In [7]:
# Network architecture: one 512-unit sigmoid hidden layer over the
# 1000-dimensional input, 50% dropout, then a softmax output layer with one
# unit per class.
model = Sequential([
    Dense(512, activation='sigmoid', input_dim=1000),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])
model.summary()

# Compile with categorical cross-entropy loss and the RMSprop optimizer,
# reporting accuracy as the metric.
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               512512    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 1026      
Total params: 513,538
Trainable params: 513,538
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Train the model, holding out 20% of the training data for validation.

# Early stopping: halt once validation accuracy has failed to improve by at
# least 1e-3 for 10 consecutive epochs.
es = EarlyStopping(monitor='val_acc',
                   min_delta=1e-3,
                   patience=10,
                   verbose=0, mode='auto')

# Keep the History object returned by fit() instead of discarding it: the
# per-epoch metrics stay available for later inspection/plotting, and the
# cell no longer ends with an address-bearing repr that changes on every run.
history = model.fit(x_train, y_train,
                    callbacks=[es],
                    batch_size=32,
                    epochs=50,
                    validation_split=0.2,
                    verbose=2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/50
8s - loss: 0.4332 - acc: 0.7948 - val_loss: 0.3290 - val_acc: 0.8620
Epoch 2/50
7s - loss: 0.3419 - acc: 0.8551 - val_loss: 0.3238 - val_acc: 0.8592
Epoch 3/50
7s - loss: 0.3262 - acc: 0.8614 - val_loss: 0.3397 - val_acc: 0.8518
Epoch 4/50
7s - loss: 0.3200 - acc: 0.8632 - val_loss: 0.3261 - val_acc: 0.8568
Epoch 5/50
7s - loss: 0.3144 - acc: 0.8681 - val_loss: 0.3237 - val_acc: 0.8608
Epoch 6/50
7s - loss: 0.3104 - acc: 0.8698 - val_loss: 0.3278 - val_acc: 0.8620
Epoch 7/50
7s - loss: 0.3072 - acc: 0.8711 - val_loss: 0.3246 - val_acc: 0.8642
Epoch 8/50
7s - loss: 0.3059 - acc: 0.8724 - val_loss: 0.3260 - val_acc: 0.8636
Epoch 9/50
7s - loss: 0.3033 - acc: 0.8728 - val_loss: 0.3344 - val_acc: 0.8578
Epoch 10/50
7s - loss: 0.3016 - acc: 0.8746 - val_loss: 0.3280 - val_acc: 0.8634
Epoch 11/50
7s - loss: 0.2998 - acc: 0.8728 - val_loss: 0.3297 - val_acc: 0.8632
Epoch 12/50
7s - loss: 0.2977 - acc: 0.8747 - val_loss: 0.3307 - val_

<keras.callbacks.History at 0x1f3a70a2a58>

In [11]:
# Score the trained model on the training data and on the test data.
# evaluate() returns [loss, accuracy] here because the model was compiled
# with metrics=['accuracy'], so index 1 is the accuracy.
score = model.evaluate(x_train, y_train)
train_accuracy = score[1]
print("\n Training Accuracy:", train_accuracy)

score = model.evaluate(x_test, y_test, verbose=0)
test_accuracy = score[1]
print("Testing Accuracy: ", test_accuracy)

 Training Accuracy: 0.883
Testing Accuracy:  0.859
