In [1]:

import  numpy  as  np
import  pandas  as pd
import  pickle
from  collections import defaultdict
import re
import sys
import os
from keras.datasets import reuters
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, BatchNormalization, Activation, LSTM, Bidirectional
from keras.models import Model,Sequential
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


# Loading and processing data

In [2]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(path="reuters.npz",
                                                         num_words=None,
                                                         skip_top=0,
                                                         maxlen=None,
                                                         test_split=0.2,
                                                         seed=113,
                                                         start_char=1,
                                                         oov_char=2,
                                                         index_from=3)

Downloading data from https://s3.amazonaws.com/text-datasets/reuters.npz


In [3]:
y_train.shape

(8982,)

In [0]:
nb_classes=46
y_train=to_categorical(y_train, nb_classes)
y_test=to_categorical(y_test, nb_classes)

In [5]:
y_train.shape

(8982, 46)

In [6]:
word_index = reuters.get_word_index(path="reuters_word_index.json")
print('Number of Unique Tokens',len(word_index))

Downloading data from https://s3.amazonaws.com/text-datasets/reuters_word_index.json
Number of Unique Tokens 30979


In [7]:
sorted(word_index.values(), reverse=True)

[30979,
 30978,
 30977,
 30976,
 30975,
 30974,
 30973,
 30972,
 30971,
 30970,
 30969,
 30968,
 30967,
 30966,
 30965,
 30964,
 30963,
 30962,
 30961,
 30960,
 30959,
 30958,
 30957,
 30956,
 30955,
 30954,
 30953,
 30952,
 30951,
 30950,
 30949,
 30948,
 30947,
 30946,
 30945,
 30944,
 30943,
 30942,
 30941,
 30940,
 30939,
 30938,
 30937,
 30936,
 30935,
 30934,
 30933,
 30932,
 30931,
 30930,
 30929,
 30928,
 30927,
 30926,
 30925,
 30924,
 30923,
 30922,
 30921,
 30920,
 30919,
 30918,
 30917,
 30916,
 30915,
 30914,
 30913,
 30912,
 30911,
 30910,
 30909,
 30908,
 30907,
 30906,
 30905,
 30904,
 30903,
 30902,
 30901,
 30900,
 30899,
 30898,
 30897,
 30896,
 30895,
 30894,
 30893,
 30892,
 30891,
 30890,
 30889,
 30888,
 30887,
 30886,
 30885,
 30884,
 30883,
 30882,
 30881,
 30880,
 30879,
 30878,
 30877,
 30876,
 30875,
 30874,
 30873,
 30872,
 30871,
 30870,
 30869,
 30868,
 30867,
 30866,
 30865,
 30864,
 30863,
 30862,
 30861,
 30860,
 30859,
 30858,
 30857,
 30856,
 30855,


In [8]:
MAX_SEQUENCE_LENGTH=0
for i in range(x_train.shape[0]):
    if len(list(x_train)[i])>MAX_SEQUENCE_LENGTH:
        MAX_SEQUENCE_LENGTH=len(list(x_train)[i])
MAX_SEQUENCE_LENGTH

2376

In [0]:
x_train = pad_sequences(x_train, maxlen=MAX_SEQUENCE_LENGTH)

In [0]:
x_test = pad_sequences(x_test, maxlen=MAX_SEQUENCE_LENGTH)

In [11]:
x_train.shape

(8982, 2376)

# Word embedding

In [12]:
! pip install pydrive
# these classes allow you to request the Google drive API
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
file_id = '1QhbmVQ3uoEmNvcjCViUK1ZZe0ErRjoEf'
downloaded = drive.CreateFile({'id': file_id})
# allows you to temporarily load your file in the notebook VM

# assume the file is called file.csv and it's located at the root of your drive
downloaded.GetContentFile('glove.6B.100d.txt')

Collecting pydrive
[?25l  Downloading https://files.pythonhosted.org/packages/52/e0/0e64788e5dd58ce2d6934549676243dc69d982f198524be9b99e9c2a4fd5/PyDrive-1.3.1.tar.gz (987kB)
[K    100% |████████████████████████████████| 993kB 6.5MB/s 
Building wheels for collected packages: pydrive
  Running setup.py bdist_wheel for pydrive ... [?25l- \ done
[?25h  Stored in directory: /root/.cache/pip/wheels/fa/d2/9a/d3b6b506c2da98289e5d417215ce34b696db856643bad779f4
Successfully built pydrive
Installing collected packages: pydrive
Successfully installed pydrive-1.3.1


In [13]:
embeddings_index= {}
f = open('glove.6B.100d.txt',encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    codes = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = codes
f.close()

print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))

Total 400000 word vectors in Glove 6B 100d.


In [0]:
vocab_len=len(word_index)+1
emb_dim=embeddings_index.get('one').shape[0]
emb_matrix=np.random.random((vocab_len,emb_dim))
for word, index in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        emb_matrix[index,:] = embedding_vector
embedding_layer = Embedding(vocab_len, emb_dim, input_length=MAX_SEQUENCE_LENGTH,trainable=True)
embedding_layer.build((None,))
embedding_layer.set_weights([emb_matrix])

In [15]:
emb_matrix.shape

(30980, 100)

# Building and compiling the model

In [16]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embeddings = embedding_layer(sequence_input)
x = Conv1D(128, 3)(embeddings)
x = BatchNormalization(axis=2)(x)
x = Activation('relu')(x)
x = MaxPooling1D(3)(x)

x = Conv1D(256, 3)(x)
x = BatchNormalization(axis=2)(x)
x = Activation('relu')(x)
x = MaxPooling1D(3)(x)

x = Conv1D(512, 3)(x)
x = BatchNormalization(axis=2)(x)
x = Activation('relu')(x)
x = MaxPooling1D(3)(x)  

x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(nb_classes, activation='softmax')(x)

model = Model(sequence_input, x)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

print("Simplified convolutional neural network")
model.summary()
cp=ModelCheckpoint('textClassification_cnn.hdf5',monitor='val_acc',verbose=1,save_best_only=True)

Simplified convolutional neural network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 2376)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 2376, 100)         3098000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2374, 128)         38528     
_________________________________________________________________
batch_normalization_1 (Batch (None, 2374, 128)         512       
_________________________________________________________________
activation_1 (Activation)    (None, 2374, 128)         0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 791, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 

### Eliminating words out of word index dictionary

In [17]:
for i in range(x_train.shape[0]):
    for j in range(x_train.shape[1]):
        if x_train[i,j]>=30980:
            print(x_train[i,j])
            print (i,j)

30981
713 2077
30980
5886 2019


In [0]:
x_train[713, 2077]=0
x_train[5886, 2019]=0

# Training the model

In [19]:
history=model.fit(x_train, y_train, validation_data=(x_test, y_test),epochs=50, batch_size=32,callbacks=[cp])

Train on 8982 samples, validate on 2246 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.36153, saving model to textClassification_cnn.hdf5
Epoch 2/50

Epoch 00002: val_acc improved from 0.36153 to 0.45948, saving model to textClassification_cnn.hdf5
Epoch 3/50

Epoch 00003: val_acc did not improve from 0.45948
Epoch 4/50

Epoch 00004: val_acc did not improve from 0.45948
Epoch 5/50

Epoch 00005: val_acc did not improve from 0.45948
Epoch 6/50

Epoch 00006: val_acc did not improve from 0.45948
Epoch 7/50

Epoch 00007: val_acc improved from 0.45948 to 0.53606, saving model to textClassification_cnn.hdf5
Epoch 8/50

Epoch 00008: val_acc did not improve from 0.53606
Epoch 9/50

Epoch 00009: val_acc did not improve from 0.53606
Epoch 10/50

Epoch 00010: val_acc did not improve from 0.53606
Epoch 11/50

Epoch 00011: val_acc improved from 0.53606 to 0.54720, saving model to textClassification_cnn.hdf5
Epoch 12/50

Epoch 00012: val_acc did not improve from 0.54720
Epoch 13/50


# LSTM model

In [23]:
y = LSTM(128, return_sequences=True)(embeddings)
y = Dropout(0.5)(y)

#y = LSTM(128, return_sequences=True)(y)
#y = Dropout(0.5)(y)

y = LSTM(128, return_sequences=False)(y)
y = Dropout(0.5)(y)

y = Dense(nb_classes, activation='softmax')(y)

model_rnn = Model(sequence_input, y)
model_rnn.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

print("Simplified recurrent neural network")
model_rnn.summary()
cp_rnn=ModelCheckpoint('textClassification_rnn.hdf5',monitor='val_acc',verbose=1,save_best_only=True)

Simplified recurrent neural network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 2376)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 2376, 100)         3098000   
_________________________________________________________________
lstm_5 (LSTM)                (None, 2376, 128)         117248    
_________________________________________________________________
dropout_6 (Dropout)          (None, 2376, 128)         0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_7 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 46) 

In [24]:
history_rnn=model_rnn.fit(x_train, y_train, validation_data=(x_test, y_test),epochs=5, batch_size=16,callbacks=[cp_rnn])

Train on 8982 samples, validate on 2246 samples
Epoch 1/5

Epoch 00001: val_acc improved from -inf to 0.51959, saving model to textClassification_rnn.hdf5
Epoch 2/5

Epoch 00002: val_acc improved from 0.51959 to 0.54363, saving model to textClassification_rnn.hdf5
Epoch 3/5

Epoch 00003: val_acc did not improve from 0.54363
Epoch 4/5

Epoch 00004: val_acc did not improve from 0.54363
Epoch 5/5

Epoch 00005: val_acc did not improve from 0.54363
