In [1]:
from keras.datasets import reuters
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [2]:
max_words = 1000 #maximum number of words to be taken into consideration in each article
batch_size = 32 #number of articles to be trained on in one go
nb_epoch = 100 #total number of times one wants to fit the model

In [3]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split = 0.2)
word_to_index = reuters.get_word_index()

index_to_word = {}
for keys, values in word_to_index.items(): index_to_word[values] = keys

In [4]:
index_to_word


{10996: 'mdbl',
 16260: 'fawc',
 12089: 'degussa',
 8803: 'woods',
 13796: 'hanging',
 20672: 'localized',
 20673: 'sation',
 20675: 'chanthaburi',
 10997: 'refunding',
 8804: 'hermann',
 20676: 'passsengers',
 20677: 'stipulate',
 8352: 'heublein',
 20713: 'screaming',
 16261: 'tcby',
 185: 'four',
 1642: 'grains',
 20680: 'broiler',
 12090: 'wooden',
 1220: 'wednesday',
 13797: 'highveld',
 7593: 'duffour',
 20681: '0053',
 3914: 'elections',
 2563: '270',
 3551: '271',
 5113: '272',
 3552: '273',
 3400: '274',
 7975: 'rudman',
 3401: '276',
 3478: '277',
 3632: '278',
 4309: '279',
 9381: 'dormancy',
 7247: 'errors',
 3086: 'deferred',
 20683: 'sptnd',
 8805: 'cooking',
 20684: 'stratabit',
 16262: 'designing',
 20685: 'metalurgicos',
 13798: 'databank',
 20686: '300er',
 20687: 'shocks',
 7972: 'nawg',
 20688: 'tnta',
 20689: 'perforations',
 2891: 'affiliates',
 20690: '27p',
 16263: 'ching',
 595: 'china',
 16264: 'wagyu',
 3189: 'affiliated',
 16265: 'chino',
 16266: 'chinh',
 2

In [5]:
num_classes = max(y_train) + 1
num_classes

46

In [6]:
print(f'# of training samples: {len(x_train)}')
print(f'# of testing samples: {len(x_test)}')

print(f'# of classes: {num_classes}')

# of training samples: 8982
# of testing samples: 2246
# of classes: 46


In [7]:
# Let's try to decode one of the articles in x_train

' '.join([index_to_word[index] for index in x_train[0]])

'the wattie nondiscriminatory mln loss for plc said at only ended said commonwealth could 1 traders now april 0 a after said from 1985 and from foreign 000 april 0 prices its account year a but in this mln home an states earlier and rise and revs vs 000 its 16 vs 000 a but 3 psbr oils several and shareholders and dividend vs 000 its all 4 vs 000 1 mln agreed largely april 0 are 2 states will billion total and against 000 pct dlrs'

In [8]:
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')

In [11]:
x_train.shape

(8982, 1000)

In [12]:
y_train

array([ 3,  4,  3, ..., 25,  3, 25], dtype=int64)

In [13]:
# vectorizing the labels

y_train = np_utils.to_categorical(y_train, num_classes)
y_test = np_utils.to_categorical(y_test, num_classes)

In [14]:
y_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [15]:
y_train.shape

(8982, 46)

Baseline Model (Softmax)

In [16]:
model = Sequential()
model.add(Dense(input_shape=(max_words,), units=num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', 
              optimizer='sgd', 
              metrics=['accuracy'])

history = model.fit(x_train, y_train, 
          epochs=nb_epoch, 
          batch_size=batch_size, 
          validation_split=0.1, 
          verbose=2)

score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=2)
print("Test score: ", score[0])
print("Test accuracy: ", score[1])

Fully connected + Dropout + Relu + Relu + Dropout + Fully connected + Softmax (MLP)


In [20]:
model = Sequential()

model.add(Dense(input_shape=(max_words,), units=512))
model.add(Dropout(0.5))
model.add(Activation('relu'))

model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))

model.add(Activation('softmax'))

(8982, 46)


(2246, 46)

In [26]:
model.compile(loss='categorical_crossentropy', 
              optimizer='sgd', 
              metrics=['accuracy'])

history = model.fit(x_train, y_train, 
          epochs=nb_epoch, 
          batch_size=batch_size, 
          validation_split=0.1, 
          verbose=2)

score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=2)
print("Test score: ", score[0])
print("Test accuracy: ", score[1])

Train on 8083 samples, validate on 899 samples
Epoch 1/100
 - 2s - loss: 2.5397 - acc: 0.4324 - val_loss: 1.9759 - val_acc: 0.5261
Epoch 2/100
 - 1s - loss: 1.9275 - acc: 0.5285 - val_loss: 1.7732 - val_acc: 0.5784
Epoch 3/100
 - 1s - loss: 1.7760 - acc: 0.5665 - val_loss: 1.6647 - val_acc: 0.6185
Epoch 4/100
 - 1s - loss: 1.6744 - acc: 0.5935 - val_loss: 1.5895 - val_acc: 0.6440
Epoch 5/100
 - 1s - loss: 1.6074 - acc: 0.6186 - val_loss: 1.5308 - val_acc: 0.6641
Epoch 6/100
 - 1s - loss: 1.5121 - acc: 0.6442 - val_loss: 1.4825 - val_acc: 0.6785
Epoch 7/100
 - 1s - loss: 1.4744 - acc: 0.6551 - val_loss: 1.4398 - val_acc: 0.6874
Epoch 8/100
 - 1s - loss: 1.4221 - acc: 0.6689 - val_loss: 1.4060 - val_acc: 0.7008
Epoch 9/100
 - 1s - loss: 1.3807 - acc: 0.6765 - val_loss: 1.3750 - val_acc: 0.7063
Epoch 10/100
 - 1s - loss: 1.3378 - acc: 0.6885 - val_loss: 1.3495 - val_acc: 0.7152
Epoch 11/100
 - 1s - loss: 1.3041 - acc: 0.6943 - val_loss: 1.3242 - val_acc: 0.7230
Epoch 12/100
 - 1s - loss: 

Epoch 97/100
 - 1s - loss: 0.4677 - acc: 0.8870 - val_loss: 0.9274 - val_acc: 0.7875
Epoch 98/100
 - 1s - loss: 0.4777 - acc: 0.8858 - val_loss: 0.9261 - val_acc: 0.7920
Epoch 99/100
 - 1s - loss: 0.4577 - acc: 0.8835 - val_loss: 0.9262 - val_acc: 0.7920
Epoch 100/100
 - 1s - loss: 0.4636 - acc: 0.8861 - val_loss: 0.9251 - val_acc: 0.7931
Test score:  0.8743165607231596
Test accuracy:  0.7920747996438112


In [27]:
print(history.history.keys())

dict_keys(['val_loss', 'val_acc', 'loss', 'acc'])
