# Reuters News Sentiment Analysis
Reuters newswire topics classification, the dataset contains 11,228 newswires from Reuters, labeled over 46 topics.   As with the IMDB dataset, each wire is encoded as a sequence of numbers.   

You task is to create a neural network that can classify which topic the price of text came from. Use an embedding layer to input the data. 

In [1]:
import numpy as np
from gensim.models import Word2Vec, Phrases#prebuilt word to vec implementation
import glob #finds all pathnames matching a pattern, like regex
import codecs #unicode support when reading files
from multiprocessing import cpu_count #use to get number of cpus on host machine
from gensim.utils import simple_preprocess,simple_tokenize #text processing
from string import punctuation #string  containing all puncuation

In [2]:
from keras.datasets import reuters
maxlen = 100 # maximum words in a sentence
vocab_size = 15000  # top 15000 most frequent words
(x_train, y_train), (x_test, y_test) = reuters.load_data(path="reuters.npz",
                                                         num_words=vocab_size,
                                                         skip_top=0,
                                                         maxlen=maxlen,
                                                         test_split=0.2,
                                                         seed=113,
                                                         start_char=1,
                                                         oov_char=2,
                                                         index_from=3)

Using TensorFlow backend.


In [96]:
print (y_train.shape)
print (x_train.shape)
print (y_test.shape)
print (x_test.shape)

(4777,)
(4777,)
(1195,)
(1195,)


In [44]:
# Checking maximum sentence length
length = []
for i in x_train:
    length.append(len(i))
sorted(length, reverse = True)
max(length)

99

In [224]:
x_train.shape

(4777,)

In [225]:
# Number of sentence
max_l = []
for i in range(len(x_train)):
    max_l.append(max(x_train[i]))
len(max_l)

4777

In [3]:
# Converting from index to word using reuters json

word_index = reuters.get_word_index(path="reuters_word_index.json")
index_to_word = {(y+3):x for x,y in word_index.items()}  
#x_train1 = x_train[0:10]
l = []
articles = []
for sentence in x_train:
    for index in sentence:
        if index_to_word.get(index) != None:
#            print (index_to_word.get(index))
            l.append(index_to_word.get(index))
#    print (l)
    articles.append(l)
    l = []
len(articles)

4777

In [4]:
index_to_word.get(30924)

'bbusx'

In [8]:
# All words (in number form) in one list
l = []
for i in x_train:
    l.extend(i)

In [6]:
len(set(l))  # number of unique words

9142

In [179]:
Word2Vec?

In [187]:
workers = cpu_count()
model = Word2Vec(articles, size=100, window=5, min_count=0, workers=workers, max_vocab_size = 15000) #fit model

In [200]:
corpus = []
for article in articles:
    corpus.extend(article)

In [202]:
word2int = {}

for word in corpus:
    if word not in word2int:
        word2int[word] = len(word2int)

In [212]:
word2int = { word:i for i , word in enumerate(model.wv.index2word)}

In [216]:
x_in = [[ word2int[word] for word in article] for article in articles]

In [11]:
from collections import Counter

In [12]:
c = Counter(l)  # counts the frequency of each word and output it in dictionary

In [14]:
c

Counter({1: 4777,
         2: 4153,
         8: 4844,
         43: 455,
         10: 3867,
         447: 52,
         5: 6573,
         25: 1702,
         207: 179,
         270: 289,
         3095: 9,
         111: 550,
         16: 2685,
         369: 145,
         186: 188,
         90: 528,
         67: 778,
         7: 4154,
         89: 410,
         19: 3533,
         102: 648,
         6: 4627,
         124: 559,
         15: 3377,
         84: 200,
         22: 1371,
         482: 128,
         26: 5166,
         48: 898,
         4: 9516,
         49: 1133,
         864: 59,
         39: 3075,
         209: 45,
         154: 107,
         151: 593,
         83: 789,
         11: 6620,
         155: 437,
         9: 3957,
         4579: 5,
         1005: 44,
         504: 101,
         258: 314,
         272: 292,
         134: 560,
         44: 1536,
         197: 232,
         1245: 15,
         52: 348,
         29: 1341,
         30: 697,
         32: 2290,
         132: 3

In [24]:
#np.sum(list(c.values(30)))
c.values()

dict_values([4777, 4153, 4844, 455, 3867, 52, 6573, 1702, 179, 289, 9, 550, 2685, 145, 188, 528, 778, 4154, 410, 3533, 648, 4627, 559, 3377, 200, 1371, 128, 5166, 898, 9516, 1133, 59, 3075, 45, 107, 593, 789, 6620, 437, 3957, 5, 44, 101, 314, 292, 560, 1536, 232, 15, 348, 1341, 697, 2290, 394, 408, 4747, 6499, 5, 35, 9, 20, 840, 3, 1, 15, 915, 40, 263, 114, 23, 1704, 4, 3, 28, 119, 48, 213, 163, 75, 117, 66, 730, 400, 1356, 16, 125, 478, 2688, 246, 9076, 65, 20, 19, 1218, 1580, 138, 1072, 76, 1166, 978, 548, 71, 125, 64, 232, 137, 160, 138, 2566, 94, 206, 30, 6, 2, 118, 87, 323, 49, 137, 317, 1461, 35, 45, 99, 187, 288, 539, 1425, 453, 245, 13, 15, 11, 19, 16, 11, 119, 232, 6, 8, 307, 1038, 108, 157, 559, 558, 99, 8, 134, 19, 20, 60, 51, 585, 13, 3, 497, 57, 63, 23, 31, 266, 96, 264, 19, 111, 61, 3, 61, 709, 330, 23, 26, 887, 82, 148, 15, 2466, 52, 6, 152, 197, 189, 189, 677, 1055, 308, 11, 12, 12, 6, 4, 1, 39, 35, 261, 57, 80, 114, 1208, 27, 96, 377, 122, 116, 26, 28, 137, 10, 92, 14,

In [139]:
c[8]

4844

In [52]:
# model.wv['bbusx'] not in vocab. bbusx : 30924
def word_count(index):
    zz = 0
    for x in x_train:
        for y in x:
            if y == index:
                zz += 1
    print (zz)

word_count(1400)

15


In [102]:
model.wv.vocab

{'said': <gensim.models.keyedvectors.Vocab at 0x1a267111d0>,
 'as': <gensim.models.keyedvectors.Vocab at 0x1a267112b0>,
 'a': <gensim.models.keyedvectors.Vocab at 0x1a267110b8>,
 'result': <gensim.models.keyedvectors.Vocab at 0x1a26711470>,
 'of': <gensim.models.keyedvectors.Vocab at 0x1a26c36b00>,
 'its': <gensim.models.keyedvectors.Vocab at 0x119805908>,
 'december': <gensim.models.keyedvectors.Vocab at 0x1a255cde48>,
 'acquisition': <gensim.models.keyedvectors.Vocab at 0x1a26cf7400>,
 'space': <gensim.models.keyedvectors.Vocab at 0x1a26cf74a8>,
 'co': <gensim.models.keyedvectors.Vocab at 0x1a26cf7860>,
 'it': <gensim.models.keyedvectors.Vocab at 0x1a26cf75f8>,
 'expects': <gensim.models.keyedvectors.Vocab at 0x1a26cf7a20>,
 'earnings': <gensim.models.keyedvectors.Vocab at 0x1a26cf7630>,
 'per': <gensim.models.keyedvectors.Vocab at 0x1a26cf79b0>,
 'share': <gensim.models.keyedvectors.Vocab at 0x1a26cf7518>,
 'in': <gensim.models.keyedvectors.Vocab at 0x1a26cf7550>,
 '1987': <gensim.m

In [114]:
word_vectors = model.wv

In [188]:
len(model.wv.vocab)

9140

In [60]:
model.wv.most_similar(positive=['earnings', 'oil'], negative = ['share'])

[('invitation', 0.9318768978118896),
 ('confident', 0.9276747703552246),
 ('recognize', 0.9258270263671875),
 ('commissioned', 0.9223984479904175),
 ('barter', 0.9219925999641418),
 ('experience', 0.9202845692634583),
 ('relationships', 0.9143248796463013),
 ('committee', 0.9122007489204407),
 ('confidence', 0.9117917418479919),
 ('parliament', 0.9110192060470581)]

In [120]:
model.wv.most_similar(positive=['earnings', 'oil'], negative = ['share'])

[('demand', 0.6610552668571472),
 ('output', 0.6451716423034668),
 ('fuel', 0.6301980018615723),
 ('natural', 0.6207156181335449),
 ('consumption', 0.6057217121124268),
 ('production', 0.5986776351928711),
 ('inventories', 0.5870170593261719),
 ('improved', 0.5747835636138916),
 ('declining', 0.5628439784049988),
 ('employment', 0.562535285949707)]

In [61]:
#shape of the output embdding matrix
model.wv.vectors.shape
#model.wv.syn0.shape

# For skip gram, your input matrix and output matrix use
# syn0 and syn1 (or syn1neg, for negative sampling)
# Detail: https://groups.google.com/forum/#!searchin/gensim/access$20input$20embeddings%7Csort:relevance/gensim/TzlqaVdZ_FA/y89Tt6kPGwAJ

(9140, 100)

In [270]:
len(model.wv.vocab)

10923

In [62]:
model.wv.vectors

array([[  2.53322367e-02,  -5.58925688e-01,  -2.84950703e-01, ...,
         -2.33507887e-01,   9.15773332e-01,   3.52596790e-01],
       [ -6.82448566e-01,   1.96484530e+00,  -1.52149528e-01, ...,
          2.52318323e-01,  -3.28187019e-01,  -3.71681303e-02],
       [  1.80127263e-01,  -3.22149873e-01,  -9.84523475e-01, ...,
         -5.03740311e-01,  -6.45654619e-01,   9.88516212e-01],
       ..., 
       [  2.65793875e-02,   1.39335205e-03,   5.51953353e-03, ...,
         -3.10676545e-03,  -1.03200767e-02,   6.11582072e-03],
       [ -3.23678087e-03,  -1.28395269e-02,  -1.11801876e-02, ...,
          3.29506816e-04,   6.34430675e-03,   9.30364244e-03],
       [  6.37674378e-03,  -7.83753209e-03,  -9.92756989e-03, ...,
          2.58355169e-03,   9.29057656e-04,   3.67837842e-03]], dtype=float32)

## Neural Network

In [88]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_Y = encoder.transform(y_train)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_train = np_utils.to_categorical(encoded_Y)

In [89]:
encoder1 = LabelEncoder()
encoder1.fit(y_test)
encoded1_Y = encoder.transform(y_test)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_test = np_utils.to_categorical(encoded1_Y)

In [71]:
len(dummy_y[0])

46

In [75]:
print (y_train[0])
print (y_test[0])

3
23


In [83]:
print (y_train)
print (y_test)
print (dummy_y)

[3 4 3 ..., 5 4 3]
[23  3 19 ...,  3  3  3]
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [84]:
print (len(np.unique(y_train)))
print (len(np.unique(y_test)))
print (len(np.unique(dummy_y)))

46
42
2


In [86]:
from keras import utils
print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
dummy_y_train = utils.to_categorical(y_train, len(np.unique(y_train)))
dummy_y_test = utils.to_categorical(y_test, len(np.unique(y_test)))
#print('y_train shape:', dummy_y_train.shape)
print('y_test shape:', dummy_y_test.shape)

Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (4777, 46)


In [90]:
dummy_y_train.shape

(4777, 46)

In [91]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words= len(model.wv.vocab))
x_train_w2v = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test_w2v = tokenizer.sequences_to_matrix(x_test, mode='binary')
print('x_train shape:', x_train_w2v.shape)
print('x_test shape:', x_test_w2v.shape)

x_train shape: (4777, 9140)
x_test shape: (1195, 9140)


In [141]:
np.unique(x_train_w2v)

array([ 0.,  1.])

In [161]:
model.wv.vectors.shape

(9140, 100)

In [150]:
from keras.preprocessing.sequence import pad_sequences
x_train_pad = pad_sequences(x_train, maxlen= 100)
x_test_pad = pad_sequences(x_test, maxlen= 100)

In [217]:
from keras.preprocessing.sequence import pad_sequences
x_in_pad = pad_sequences(x_in, maxlen= 100)


In [92]:
x_train_w2v.shape

(4777, 9140)

In [164]:
Embedding?

In [220]:
from keras.layers import Embedding

embedding_layer = Embedding(9140,#size of input seq. number of words in each sentence
                            100,#size of embediing dims. (9140 vocab, 100 dimension)
                            weights=[model.wv.vectors],
                            input_length= 100,
                            trainable=True)

# Embedding(input_dim, output_dim, init='uniform', input_length=None, W_regularizer=None, activity_regularizer=None, W_constraint=None, mask_zero=False, weights=None, dropout=0.0)

# input_dim: int >= 0. Vocabulary size, ie. 1 + maximum integer index occurring in the input data.
# output_dim: int >= 0. Dimension of the dense embedding. 100 Dim
# input_length: number of words to use from each review. Length of input sequences, when it is constant. This argument is required if you are going to connect  Flatten then Dense layers upstream (without it, the shape of the dense outputs cannot be computed).
# weights: list of numpy arrays to set as initial weights. The list should have 1 element, of shape (input_dim, output_dim).


# https://faroit.github.io/keras-docs/1.0.1/layers/embeddings/

In [221]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, Conv1D
m = Sequential()
m.add(embedding_layer)
m.add(Flatten())
m.add(Dense(46, activation = 'sigmoid')) # 46 = len(np.unique(y_train))
m.compile(optimizer='Adadelta', loss= 'categorical_crossentropy', metrics=['acc'])

m.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 100, 100)          914000    
_________________________________________________________________
flatten_5 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 46)                460046    
Total params: 1,374,046
Trainable params: 1,374,046
Non-trainable params: 0
_________________________________________________________________


In [222]:
history = m.fit(x_in_pad, dummy_y_train, epochs = 30, batch_size = 32, validation_split = 0.2)

Train on 3821 samples, validate on 956 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [373]:
history.history['acc']

[0.35267919278343934,
 0.3533750870075173,
 0.35337508698885195,
 0.3533750870075173,
 0.3533750870075173,
 0.3533750870075173,
 0.35337508704484794,
 0.35337508699092585,
 0.35337508699714765,
 0.35337508699922154]

In [375]:
score = m.evaluate(x_test_w2v, dummy_y_test,
                       batch_size=32, verbose=1)



In [376]:
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 10.2837180713
Test accuracy: 0.361976847782


## Other: Using just Keras embedding

In [25]:
from keras.datasets import reuters
maxlen = 100
vocab_size = 15000
(x_train_simple, y_train_simple), (x_test_simple, y_test_simple) = reuters.load_data(path="reuters.npz",
                                                         num_words=vocab_size,
                                                         skip_top=0,
                                                         maxlen=maxlen,
                                                         test_split=0.2,
                                                         seed=113,
                                                         start_char=1,
                                                         oov_char=2,
                                                         index_from=3)

In [26]:
from keras.preprocessing.sequence import pad_sequences
x_train_simple = pad_sequences(x_train_simple, maxlen= maxlen)
x_test_simple =  pad_sequences(x_test_simple, maxlen= maxlen)

In [27]:
x_train_simple.shape

(4777, 100)

In [28]:
x_train_simple

array([[   0,    0,    0, ...,   15,   17,   12],
       [   0,    0,    0, ...,  505,   17,   12],
       [   0,    0,    0, ...,  254,   17,   12],
       ..., 
       [   0,    0,    0, ...,   76,   17,   12],
       [   0,    0,    0, ...,   15,   17,   12],
       [   0,    0,    0, ..., 8574,   17,   12]], dtype=int32)

In [34]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
encoder = LabelEncoder()
encoder.fit(y_train_simple)
encoded_Y = encoder.transform(y_train_simple)
# convert integers to dummy variables (i.e. one hot encoded)
y_train_simple = np_utils.to_categorical(encoded_Y)

In [30]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding
m1 = Sequential()
m1.add(Embedding(14999, 8, input_length = maxlen)) #10000 for vocab size, 8 for dimensionality of embedding
m1.add(Flatten())
#m1.add(Dense(30, input_dim=20, activation='relu'))
m1.add(Dense(46, activation='softmax'))
m1.compile(optimizer='Adadelta', loss= 'categorical_crossentropy', metrics=['acc'])
m1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 8)            119992    
_________________________________________________________________
flatten_2 (Flatten)          (None, 800)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 46)                36846     
Total params: 156,838
Trainable params: 156,838
Non-trainable params: 0
_________________________________________________________________


In [31]:
np.unique(y_train)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45])

In [36]:
history1 = m1.fit(x_train_simple, y_train_simple,
epochs=30,
batch_size=32,
validation_split=0.2)

Train on 3821 samples, validate on 956 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
