# Setup

In [1]:
# https://github.com/fchollet/keras-resources

# Dataset of 11,228 newswires from Reuters, labeled over 46 topics. 
# As with the IMDB dataset, each wire is encoded as a sequence of word indexes 
# (same conventions).

import numpy as np
import keras
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [2]:
max_words = 1000
batch_size = 32
epochs = 5

In [3]:
# https://keras.io/datasets/#reuters-newswire-topics-classification
print('Loading data...')
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words,
                                                         test_split=0.2)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

Loading data...
8982 train sequences
2246 test sequences


In [4]:
# look at data
print(x_train)

[ list([1, 2, 2, 8, 43, 10, 447, 5, 25, 207, 270, 5, 2, 111, 16, 369, 186, 90, 67, 7, 89, 5, 19, 102, 6, 19, 124, 15, 90, 67, 84, 22, 482, 26, 7, 48, 4, 49, 8, 864, 39, 209, 154, 6, 151, 6, 83, 11, 15, 22, 155, 11, 15, 7, 48, 9, 2, 2, 504, 6, 258, 6, 272, 11, 15, 22, 134, 44, 11, 15, 16, 8, 197, 2, 90, 67, 52, 29, 209, 30, 32, 132, 6, 109, 15, 17, 12])
 list([1, 2, 699, 2, 2, 56, 2, 2, 9, 56, 2, 2, 81, 5, 2, 57, 366, 737, 132, 20, 2, 7, 2, 49, 2, 2, 2, 2, 699, 2, 8, 7, 10, 241, 16, 855, 129, 231, 783, 5, 4, 587, 2, 2, 2, 775, 7, 48, 34, 191, 44, 35, 2, 505, 17, 12])
 list([1, 53, 12, 284, 15, 14, 272, 26, 53, 959, 32, 818, 15, 14, 272, 26, 39, 684, 70, 11, 14, 12, 2, 18, 180, 183, 187, 70, 11, 14, 102, 32, 11, 29, 53, 44, 704, 15, 14, 19, 758, 15, 53, 959, 47, 2, 15, 14, 19, 132, 15, 39, 965, 32, 11, 14, 147, 72, 11, 180, 183, 187, 44, 11, 14, 102, 19, 11, 123, 186, 90, 67, 960, 4, 78, 13, 68, 467, 511, 110, 59, 89, 90, 67, 2, 55, 2, 92, 617, 80, 2, 46, 905, 220, 13, 4, 346, 48, 235, 6

In [5]:
print(y_train)

[ 3  4  3 ..., 25  3 25]


In [6]:
num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

46 classes


In [7]:
print('Vectorizing sequence data...')
tokenizer = Tokenizer(num_words=max_words)
x_train_bin = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test_bin = tokenizer.sequences_to_matrix(x_test, mode='binary')
print('x_train shape:', x_train_bin.shape)
print('x_test shape:', x_test_bin.shape)

Vectorizing sequence data...
x_train shape: (8982, 1000)
x_test shape: (2246, 1000)


In [8]:
# look at binary-encoded seq of word features
print(x_train_bin)

[[ 0.  1.  1. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]
 ..., 
 [ 0.  1.  1. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]]


In [9]:
print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train_bin = keras.utils.to_categorical(y_train, num_classes)
y_test_bin = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train_bin.shape)
print('y_test shape:', y_test_bin.shape)

Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (8982, 46)
y_test shape: (2246, 46)


In [10]:
print(y_train_bin)

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


# Make model

In [11]:
print('Building model...')
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))  # num features is the number of words
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

Building model...


In [12]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [13]:
history = model.fit(x_train_bin, y_train_bin,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Train on 8083 samples, validate on 899 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Evaluate

In [14]:
score = model.evaluate(x_test_bin, y_test_bin,
                       batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test accuracy: 0.794746215494


# Sanity check

In [15]:
text = """Florida State quarterback Deondre Francois will miss the rest of the season after tearing the patellar tendon in his left knee in the fourth quarter Saturday night in a 24-7 loss to No. 1 Alabama in Atlanta.

People familiar with the situation confirmed the injury to The Associated Press on Sunday night. They spoke on condition of anonymity because of the school’s medical information policy.

The Tallahassee Democrat first reported the injury.

Coach Jimbo Fisher is expected to have a further update, including when Francois will have surgery, during his weekly press conference Monday. Fisher said after Saturday’s game that if Francois was out, James Blackman would likely be the starter."""

In [16]:
tokens = keras.preprocessing.text.text_to_word_sequence(text,
                                                        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                                                        lower=True,
                                                        split=" ")

In [17]:
print(tokens[:10])

['florida', 'state', 'quarterback', 'deondre', 'francois', 'will', 'miss', 'the', 'rest', 'of']


In [18]:
word_index = reuters.get_word_index(path="reuters_word_index.json")

In [19]:
token_ids = []
for word in tokens:
    try:
        token_id = word_index[word]
    except KeyError:
        #token_id = 10000
        pass
    token_ids.append(token_id)
token_ids = [token_ids]  # because Tokenizer.sequences_to_matrix expects list of lists

In [20]:
print(token_ids[:10])

[[2170, 275, 275, 275, 8026, 30, 6128, 1, 1350, 2, 1, 757, 89, 89, 1, 1, 1, 4, 268, 1206, 1206, 4, 1, 490, 95, 2368, 1402, 4, 7, 293, 58, 43, 3, 126, 16, 7160, 4, 4459, 869, 9275, 28, 1, 817, 1610, 1, 8088, 3, 1, 1706, 879, 18, 2377, 1402, 74, 5154, 18, 1857, 2, 7437, 158, 2, 1, 1, 1539, 986, 253, 1, 1, 2920, 96, 254, 1, 8088, 8088, 8088, 6035, 20, 130, 3, 54, 7, 223, 9432, 345, 182, 8026, 30, 54, 23426, 278, 268, 1210, 879, 519, 915, 6035, 5, 89, 89, 6574, 21, 104, 8026, 31, 239, 881, 26241, 38, 325, 27, 1, 1]]


In [21]:
tokenizer = Tokenizer(num_words=max_words)
token_ids_binary = tokenizer.sequences_to_matrix(token_ids, mode='binary')
print('x_train shape:', x_train.shape)

x_train shape: (8982,)


In [63]:
model.predict(token_ids_binary)

array([[ 0.02750144,  0.01755367,  0.00333777,  0.11415914,  0.01237924,
         0.00770436,  0.0058803 ,  0.01135649,  0.0469833 ,  0.01815013,
         0.00366627,  0.05110939,  0.01279181,  0.01228714,  0.01462033,
         0.01384593,  0.0128146 ,  0.00326411,  0.04919186,  0.05915083,
         0.01599696,  0.02273646,  0.0048194 ,  0.03132347,  0.00684222,
         0.01759172,  0.02818714,  0.01004125,  0.01873011,  0.01467679,
         0.0140591 ,  0.03050522,  0.03104342,  0.00471372,  0.07257928,
         0.01004172,  0.06032096,  0.01282194,  0.01943736,  0.00690318,
         0.02826691,  0.00627373,  0.00598172,  0.02143936,  0.00355635,
         0.00336254]], dtype=float32)

In [22]:
model.predict_classes(token_ids_binary)



array([3])

In [26]:
# now look at other class 3's
# x = numpy.array([1,0,2,0,3,0,4,5,6,7,8])
# numpy.where(x == 0)[0]
# array([1, 3, 5])

cat_3_index = np.where(y_train == 3)

In [30]:
word_index_rev = {}
for key, val in word_index.items():
    word_index_rev[val] = key

In [54]:
cat_3_index = list(cat_3_index[0])

In [62]:
for count, idx in enumerate(cat_3_index):
    if count > 9:
        break
    seq_ids = x_train[idx]
    class_3_txt = ' '.join([word_index_rev[_id] for _id in seq_ids])
    print(class_3_txt)
    print('')
    

the of of mln loss for plc said at only ended said of could 1 traders now april 0 a after said from 1985 and from foreign 000 april 0 prices its account year a but in this mln home an states earlier and rise and revs vs 000 its 16 vs 000 a but 3 of of several and shareholders and dividend vs 000 its all 4 vs 000 1 mln agreed of april 0 are 2 states will billion total and against 000 pct dlrs

the lt dlrs demand 000 reuter dividend year lt plus billion 04 000 reuter dividend year an worth new vs reuter dlrs of on shrs earnings countries new vs reuter 1985 billion vs 2 lt 4 division 000 reuter from go 000 lt plus which of 000 reuter from total 000 an 71 billion vs reuter dlr also vs shrs earnings countries 4 vs reuter 1985 from vs some now april 0 related in corp it inc strong cents dollar were after april 0 of or of more index 10 of company taking report it in estimated but trading texas said united said of a of up said countries vs 000 3 of central said which of on future of said of a 

In [68]:
# these don't look like my input text!