In [1]:
from tensorflow.keras.datasets import reuters
# Load the Reuters newswires; num_words=10000 restricts the vocabulary to the
# 10,000 most frequent words (per the Keras dataset documentation).
(train_data, train_labels), (test_data, test_labels) = reuters.load_data(
    num_words=10000
)

import numpy as np
def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode integer sequences into a 2-D float array.

    Args:
        sequences: Sized collection of integer index sequences; each index is
            used directly as a column position in the output row.
        dimension: Number of columns in the encoded array.

    Returns:
        numpy.ndarray of shape (len(sequences), dimension) holding 1.0 at
        every column listed in the corresponding sequence and 0.0 elsewhere.
        Repeated indices within a sequence still produce a single 1.0.
    """
    n_samples = len(sequences)
    encoded = np.zeros((n_samples, dimension))
    for row_idx, word_ids in enumerate(sequences):
        # Fancy-index assignment: switch on all listed columns of this row.
        encoded[row_idx, word_ids] = 1.0
    return encoded

# Multi-hot encode every newswire so it can be fed to the Dense input layer.
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

# Integer class labels as float32 arrays (not referenced by the cells shown
# in this notebook, which train on the one-hot labels below).
y_train = np.asarray(train_labels, dtype='float32')
y_test = np.asarray(test_labels, dtype='float32')

# One-hot labels: the target format used with the categorical cross-entropy loss.
from tensorflow.keras.utils import to_categorical
one_hot_train_labels = to_categorical(train_labels)
one_hot_test_labels = to_categorical(test_labels)

# Hold out the first 1000 training samples for validation; train on the rest.
x_val, partial_x_train = x_train[:1000], x_train[1000:]
y_val, partial_y_train = one_hot_train_labels[:1000], one_hot_train_labels[1000:]

from tensorflow.keras import models
from tensorflow.keras import layers
# Two hidden ReLU layers of 64 units feeding a 46-way softmax output.
model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(10000,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(46, activation='softmax'),
])

model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on the reduced training split while tracking loss on the held-out
# validation samples; the per-epoch curves end up in history.history.
history = model.fit(
    partial_x_train,
    partial_y_train,
    epochs=20,
    batch_size=512,
    validation_data=(x_val, y_val),
    verbose=0,
)

In [2]:
# Pick the epoch count that minimised the validation loss in the previous run.
# history.history['val_loss'] is 0-indexed: entry k is recorded after epoch
# k + 1 has finished. np.argmin therefore returns (best epoch - 1), so add 1 to
# turn the index into a number of epochs. Passing the bare index would stop
# one epoch early, and would train for 0 epochs if the first epoch were best.
val_loss = history.history['val_loss']
opt_epochs = int(np.argmin(val_loss)) + 1

# Re-create the same architecture with fresh weights so the final model is
# trained from scratch rather than continuing from the validation run.
model = models.Sequential()
model.add(layers.Dense(64, activation = 'relu', input_shape = (10000,)))
model.add(layers.Dense(64, activation = 'relu'))
model.add(layers.Dense(46, activation = 'softmax'))

model.compile(optimizer = 'rmsprop',
             loss = 'categorical_crossentropy',
             metrics = ['accuracy'])

# Train on the full training set (no validation split) for the chosen number
# of epochs.
# NOTE: this rebinds `history` to a run without validation data, so it has no
# 'val_loss' entry; re-running this cell without re-running the previous one
# will raise KeyError on the first line.
history = model.fit(x_train,
                    one_hot_train_labels,
                    epochs = opt_epochs,
                    batch_size = 512,
                    verbose = 0)

In [7]:
# Evaluate on the held-out test set. The printed list is [loss, accuracy]:
# the compiled categorical cross-entropy followed by the 'accuracy' metric.
print(model.evaluate(x_test, one_hot_test_labels))

[0.9292786717414856, 0.7951914668083191]


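The loss and accuracy of the trained model on the test set. The loss is about 0.93 and the accuracy is about 0.795, i.e. roughly 80% of the test wires are assigned the correct topic.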

Is this better than a random baseline model?

Is this better than a random baseline model?

In [8]:
import numpy as np
# Number of training samples in each of the 46 classes (kept as floats).
totals = np.bincount(train_labels, minlength=46).astype(float)
# The most frequent training class is the baseline's constant prediction.
indx = np.argmax(totals)
# Fraction of test wires whose true label equals that class.
hits = np.asarray(test_labels) == indx
print(np.sum(hits) / len(test_labels))

0.3619768477292965


Our baseline model always predicts the most populated class in the training set. The accuracy of this informed guesswork is about 0.362 (36%), so the trained network, at roughly 80%, does considerably better.

In [9]:
# One softmax output vector (46 class scores) per test wire.
predictions = model.predict(x_test)

# Side-by-side view of predicted and true class for the first ten test wires;
# the predicted class is the position of the largest score.
print('wire', '\t', 'pred', '\t', 'label')
for i in range(10):
    print(i, '\t', predictions[i].argmax(), '\t', test_labels[i])

wire 	 pred 	 label
0 	 3 	 3
1 	 10 	 10
2 	 1 	 1
3 	 4 	 4
4 	 13 	 4
5 	 3 	 3
6 	 3 	 3
7 	 3 	 3
8 	 3 	 3
9 	 3 	 3


Predictions and labels for the first 10 wires in the test set.

In [3]:
def indices_of_wrong_predictions():
    """Return the test-set positions where the predicted class is wrong.

    Reads the notebook-level `x_test`, `predictions` and `test_labels`. The
    predicted class of sample i is the argmax of `predictions[i]`.

    Returns:
        list[int]: indices i, in ascending order, for which the argmax of
        predictions[i] differs from test_labels[i].
    """
    return [
        i
        for i in range(len(x_test))
        if np.argmax(predictions[i]) != test_labels[i]
    ]

In [4]:
def decode_wire(data):
    """Turn a sequence of word indices back into space-separated text.

    Each index is shifted down by 3 before being looked up in the inverted
    Reuters word index (presumably undoing the default `index_from=3` offset
    applied by `reuters.load_data`). Indices with no entry become '?'.

    Args:
        data: iterable of integer word indices.

    Returns:
        str: the decoded words joined by single spaces.
    """
    # Invert the word -> index mapping so indices can be looked up directly.
    index_to_word = {idx: word for word, idx in reuters.get_word_index().items()}
    words = (index_to_word.get(token - 3, '?') for token in data)
    return ' '.join(words)

In [5]:
# Topic name for each Reuters class label: topics[label] is the name of class
# `label` (46 entries, labels 0-45). The mapping is taken from this GitHub
# issue rather than from the Keras API itself:
# https://github.com/keras-team/keras/issues/12072
topics = [
    'cocoa', 'grain', 'veg-oil', 'earn', 'acq',                     # 0-4
    'wheat', 'copper', 'housing', 'money-supply', 'coffee',         # 5-9
    'sugar', 'trade', 'reserves', 'ship', 'cotton',                 # 10-14
    'carcass', 'crude', 'nat-gas', 'cpi', 'money-fx',               # 15-19
    'interest', 'gnp', 'meal-feed', 'alum', 'oilseed',              # 20-24
    'gold', 'tin', 'strategic-metal', 'livestock', 'retail',        # 25-29
    'ipi', 'iron-steel', 'rubber', 'heat', 'jobs',                  # 30-34
    'lei', 'bop', 'zinc', 'orange', 'pet-chem',                     # 35-39
    'dlr', 'gas', 'silver', 'wpi', 'hog',                           # 40-44
    'lead',                                                         # 45
]

In [6]:
def misprediction(i):
    """Print the i-th misclassified test wire with its predicted and true topic.

    Args:
        i: position within the list returned by
            indices_of_wrong_predictions() (not an index into the test set).

    Prints one line with the predicted and expected topic names, a blank
    line, and then the decoded text of the wire. Returns None.
    """
    test_idx = indices_of_wrong_predictions()[i]
    predicted_topic = topics[np.argmax(predictions[test_idx])]
    expected_topic = topics[test_labels[test_idx]]
    print('predicted: ', predicted_topic, '\texpected: ', expected_topic, '\n')
    print(decode_wire(test_data[test_idx]))


In [10]:
# Show the first misclassified test wire (predicted vs. expected topic, then text).
misprediction(0)

predicted:  ship 	expected:  acq 

? strong south ? winds were keeping many vessels trapped in the ice off the finnish and swedish coasts in one of the worst icy periods in the baltic for many years the finnish board of navigation said in finland and sweden up to 50 vessels were reported to be stuck in the ice and even the largest of the ? ? were having difficulties in breaking through to the ? ships ? officials said however icy conditions in the southern baltic at the soviet oil ports of ? and ? had eased they said weather officials in neighbouring sweden said the icy conditions in the baltic were the worst for 30 years with ships fighting a losing battle to keep moving in the coastal stretches of the gulf of ? which ? finland and sweden the ice is up to one ? thick with ? and ? packing it into almost ? walls three metres high swedish ? officials said weather forecasts say winds may ease during the weekend but a further drop in temperature could bring shipping to a standstill the offi

The first mistaken prediction...

In [11]:
# Show the second misclassified test wire (predicted vs. expected topic, then text).
misprediction(1)

predicted:  grain 	expected:  wheat 

? grain traders said they were still awaiting results of yesterday's u k intervention feed wheat tender for the home market the market sought to buy 340 000 tonnes more than double the remaining 150 000 tonnes available under the current tender however some of the tonnage included ? bids for supplies in the same stores since the tenders started last july ? 000 tonnes of british feed wheat have been sold back to the home market reuter 3


And the second...

And the third...