# Reuters - Newswire Topic Classification
The Reuters dataset contains 11,228 newswires from Reuters, each labeled with one of 46 topics. As with the IMDB dataset, each wire is encoded as a sequence of integer word indices.

Our task is to build a neural network that predicts which topic a newswire belongs to. The word indices are fed through an embedding layer, which turns each index into a dense vector the rest of the network can learn from.

In [1]:
import numpy as np

In [2]:
from keras.datasets import reuters
from keras.preprocessing.sequence import pad_sequences

# Notebook-wide settings, reused by the padding and model cells.
maxlen = 100       # every newswire is padded/cut to this many tokens
vocab_size = 1000  # word indices are capped below this value

# Load the pre-tokenised Reuters newswires, already split into train and test.
(x_train, y_train), (x_test, y_test) = reuters.load_data(
    path="reuters.npz",
    num_words=vocab_size,  # keep only the most frequent words; rarer ones become oov_char
    skip_top=5,            # the most frequent entries are masked as oov_char as well
    maxlen=None,           # no length filtering at load time; padding happens later
    test_split=0.2,        # hold out 20% of the wires for testing
    seed=113,              # fixed shuffle seed so the split is reproducible
    start_char=1,          # index marking the start of a sequence
    oov_char=2,            # index substituted for any filtered-out word
    index_from=3,          # real words are indexed from this value upwards
)

Using TensorFlow backend.


In [3]:
# Before padding the training set is a 1-D array holding one sequence per newswire.
x_train.shape

(8982,)

In [4]:
# Same check for the held-out split: one entry per test newswire.
x_test.shape

(2246,)

In [5]:
# Bring every newswire to exactly `maxlen` tokens so both splits become
# rectangular integer matrices that the embedding layer can consume.
x_train, x_test = [
    pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)
]

In [6]:
# After padding the training data is a 2-D matrix: (samples, maxlen).
x_train.shape

(8982, 100)

In [7]:
# Peek at the padded matrix; short sequences are filled with zeros on the left.
x_train

array([[  0,   0,   0, ...,  15,  17,  12],
       [  0,   0,   0, ..., 505,  17,  12],
       [ 19, 758,  15, ...,  11,  17,  12],
       ..., 
       [  0,   0,   0, ..., 407,  17,  12],
       [ 88,   2,  72, ..., 364,  17,  12],
       [125,   2,  21, ..., 113,  17,  12]], dtype=int32)

In [8]:
# Sanity check: the largest word index must stay below vocab_size,
# otherwise the embedding lookup would go out of range.
x_train.max()

999

In [9]:
# The test split has the same padded width as the training split.
x_test.shape

(2246, 100)

In [10]:
# Labels are still plain integer topic ids at this point.
y_test

array([ 3, 10,  1, ...,  3,  3, 24])

In [11]:
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense, Embedding, Dropout
# Legacy import kept as-is in case other cells rely on these names; this cell no longer uses them.
from keras.layers.convolutional import Convolution1D, MaxPooling1D

num_classes = 46  # number of Reuters topics, i.e. the width of the softmax output

# Architecture: embedding -> 1-D convolution -> flatten -> dense classifier.
model = Sequential()
# Tie the embedding table to vocab_size instead of a hard-coded 1000, so the
# table always matches the `num_words` cap used when the data was loaded.
model.add(Embedding(vocab_size, 8, input_length=maxlen))
# Keras 2 spelling of the convolution arguments (formerly nb_filter /
# filter_length / border_mode); 'same' padding keeps the sequence length.
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))  # regularise the large dense layer
model.add(Dense(num_classes, activation='softmax'))
# categorical_crossentropy expects one-hot encoded labels.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 8)            8000      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 100, 32)           800       
_________________________________________________________________
flatten_1 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               1638912   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 46)                23598     
Total params: 1,671,310
Trainable params: 1,671,310
Non-trainable params: 0
_________________________________________________________________


  


In [12]:
# Re-check the input shape the model will be trained on: (samples, maxlen).
x_train.shape

(8982, 100)

In [13]:
# First padded newswire: leading zeros are padding, the value 2 marks
# out-of-vocabulary tokens (oov_char), the rest are word indices.
x_train[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         2,   2,   2,   8,  43,  10, 447,   5,  25, 207, 270,   5,   2,
       111,  16, 369, 186,  90,  67,   7,  89,   5,  19, 102,   6,  19,
       124,  15,  90,  67,  84,  22, 482,  26,   7,  48,   2,  49,   8,
       864,  39, 209, 154,   6, 151,   6,  83,  11,  15,  22, 155,  11,
        15,   7,  48,   9,   2,   2, 504,   6, 258,   6, 272,  11,  15,
        22, 134,  44,  11,  15,  16,   8, 197,   2,  90,  67,  52,  29,
       209,  30,  32, 132,   6, 109,  15,  17,  12], dtype=int32)

In [14]:
# One integer label per training newswire (before one-hot encoding).
y_train.shape

(8982,)

In [15]:
# One integer label per test newswire (before one-hot encoding).
y_test.shape

(2246,)

In [16]:
# Training labels as raw integer topic ids.
y_train

array([ 3,  4,  3, ..., 25,  3, 25])

In [17]:
from keras.utils import np_utils # one hot encode the y-label

# One-hot encode the integer topic ids into 46 columns, matching the width of
# the model's softmax layer. The labels are overwritten in place, so each
# conversion is guarded: the arrays are only 1-D while they still hold raw
# class ids, which makes re-running this cell a no-op instead of re-encoding
# (and corrupting) labels that are already one-hot.
if y_train.ndim == 1:
    y_train = np_utils.to_categorical(y_train, 46)
if y_test.ndim == 1:
    y_test = np_utils.to_categorical(y_test, 46)

In [18]:
# After one-hot encoding each label is a row with one column per topic.
y_train.shape

(8982, 46)

In [19]:
# Peek at the one-hot matrix; the truncated view shows mostly zeros.
y_train

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [20]:
# Train for 20 passes over the training data in mini-batches of 32 and keep
# the returned history object for later inspection.
# NOTE(review): the test split is also passed as validation data, so the final
# evaluation below is not measured on unseen data in the strict sense —
# consider carving a validation set out of the training data instead.
history = model.fit(
    x_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_data=(x_test, y_test),
)

Train on 8982 samples, validate on 2246 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [21]:
# Score the trained model on the test split; the result holds the loss
# followed by the 'acc' metric requested when the model was compiled.
score = model.evaluate(x=x_test, y=y_test)



In [22]:
# Two values: test loss first, then test accuracy.
score

[1.9919528770107089, 0.67008014247551206]

In [23]:
# Index 1 is the accuracy metric ('acc' from model.compile).
score[1]

0.67008014247551206