In [None]:
import numpy as np
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the news-aggregator CSV on the mounted Drive.
csv_path = '/content/drive/MyDrive/Colab Notebooks/News classification LSTM/uci-news-aggregator.csv'

# Read the whole table, then preview its first rows as the cell output.
data = pd.read_csv(filepath_or_buffer=csv_path)
data.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [None]:
# Keep only the headline text and its category code.
kept_columns = ['TITLE', 'CATEGORY']
data = data[kept_columns]
data.head()

Unnamed: 0,TITLE,CATEGORY
0,"Fed official says weak data caused by weather,...",b
1,Fed's Charles Plosser sees high bar for change...,b
2,US open: Stocks fall after Fed official hints ...,b
3,"Fed risks falling 'behind the curve', Charles ...",b
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b


In [None]:
# (row count, column count) of the frame
data.shape

(422419, 2)

In [None]:
# Distinct headline strings, returned as an array
data['TITLE'].unique()

array(['Fed official says weak data caused by weather, should not slow taper',
       "Fed's Charles Plosser sees high bar for change in pace of tapering",
       'US open: Stocks fall after Fed official hints at accelerated tapering',
       ...,
       'Child who swallowed battery to have reconstructive surgery at Cincinnati  ...',
       'Phoenix boy undergoes surgery to repair throat damage - WFSB 3 Connecticut',
       'Phoenix boy undergoes surgery to repair throat damage - CBS 3 Springfield  ...'],
      dtype=object)

In [None]:
# Distinct category codes present in the CATEGORY column
data['CATEGORY'].unique()

array(['b', 't', 'e', 'm'], dtype=object)

In [None]:
# Number of missing values in each column
data.isna().sum()

TITLE       0
CATEGORY    0
dtype: int64

In [None]:
# Number of rows for each category code
data['CATEGORY'].value_counts()

e    152469
b    115967
t    108344
m     45639
Name: CATEGORY, dtype: int64

In [None]:
# Build a class-balanced subset: rows of the same class are grouped together in
# the raw data, so shuffle first, keep the same number of rows for every class,
# then shuffle the combined frame again.
n_categories = 45000  # rows kept per class

# Seeded generator so the same subsample and ordering are produced on every run.
rng = np.random.RandomState(42)

shuffled = data.reindex(rng.permutation(data.index))

# One frame per category code, each truncated to the first n_categories shuffled rows.
e, b, t, m = (
    shuffled[shuffled['CATEGORY'] == code].head(n_categories)
    for code in ('e', 'b', 't', 'm')
)

# Stack the four frames vertically with a fresh 0..N-1 index.
concat_data = pd.concat([e, b, t, m], ignore_index=True)

# After concatenation the rows are ordered class by class again; shuffle once
# more so the classes are interleaved.
concat_data = concat_data.reindex(rng.permutation(concat_data.index))
print(concat_data.shape)

(180000, 2)


In [None]:
from keras.utils import to_categorical

# Integer id assigned to each category code; the id is also the position of the
# 1 in the one-hot vector produced below.
label_by_category = {'e': 0, 'b': 1, 't': 2, 'm': 3}

# Derive LABEL and remove CATEGORY in one step, producing a new frame.
# The guard makes the cell safe to re-run: once CATEGORY is gone, LABEL already
# exists and there is nothing left to derive.
if 'CATEGORY' in concat_data.columns:
    concat_data = (
        concat_data
        .assign(LABEL=concat_data['CATEGORY'].map(label_by_category))
        .drop(columns=['CATEGORY'])
    )

# A category code missing from the mapping would surface as NaN here.
assert concat_data['LABEL'].notna().all(), 'unmapped CATEGORY code'

print(concat_data['LABEL'][:10])

# One-hot encode the integer ids: one column per class.
labels = to_categorical(concat_data['LABEL'], num_classes=len(label_by_category))
print(labels[:10])

print(concat_data.head())
print(concat_data.shape)

51892     1.0
63896     1.0
107053    2.0
168658    3.0
151923    3.0
81408     1.0
96627     2.0
80250     1.0
70738     1.0
17212     0.0
Name: LABEL, dtype: float64
[[0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]
                                                    TITLE  LABEL
51892               Latest info on search for missing jet    1.0
63896            Study finds that quakes tied to drilling    1.0
107053    Can "Titanfall" Help Save Microsoft's Xbox One?    2.0
168658          Colonoscopy can prevent Colorectal cancer    3.0
151923  Oscar Pistorius murder trial: Recap after expe...    3.0
(180000, 2)


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

n_most_freq_words = 8000  # vocabulary cap passed to the Tokenizer
max_length = 130          # every sequence is padded/truncated to this length

# num_words: the maximum number of words to keep, based on word frequency
# (only the most common num_words-1 words are kept).
# filters: characters removed from the text. The backslash is written as `\\`
# so the literal contains no invalid escape sequence; the resulting string is
# unchanged.
tokenizer = Tokenizer(
    num_words=n_most_freq_words,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_{|}~',
    lower=True,
)

# NOTE(review): the vocabulary is fitted on every title, before the data is
# split into train and test sets in a later cell.
titles = concat_data['TITLE'].values
tokenizer.fit_on_texts(titles)                   # build the internal vocabulary
sequences = tokenizer.texts_to_sequences(titles)  # each title -> list of integer ids
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens')

# pad_sequences turns the list of integer lists into a 2D array of shape
# (num_samples, maxlen): shorter sequences are padded up to maxlen, longer ones
# are truncated to fit.
X = pad_sequences(sequences, maxlen=max_length)

print(X)
print(X.shape)

Found 52589 unique tokens
[[   0    0    0 ...    5  493  640]
 [   0    0    0 ... 1658    1 4570]
 [   0    0    0 ...  843  215   40]
 ...
 [   0    0    0 ...  468  400 2755]
 [   0    0    0 ...   17 1501 2046]
 [   0    0    0 ...    1  253  898]]
(180000, 130)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; random_state fixes the split.
split_parts = train_test_split(X, labels, random_state=42, test_size=0.25)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Shapes of the train/test features and labels, in that order.
split_shapes = tuple(part.shape for part in (X_train, y_train, X_test, y_test))
print(split_shapes)

((135000, 130), (135000, 4), (45000, 130), (45000, 4))


In [None]:
# Training and model hyperparameters
emb_dim = 128     # size of each embedding vector
batch_size = 128  # samples per gradient update
epochs = 10       # upper bound on training epochs

In [None]:
# An Embedding layer can be used in two ways:
# 1) Add the Embedding layer directly to the network, so its weights are learned together with the rest of the model
# 2) Train the embeddings separately beforehand, then load those learned weights into the network's Embedding layer

# This notebook uses option (1)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping

# Embedding -> spatial dropout -> LSTM -> softmax over the 4 classes.
model = Sequential()

# Embedding(vocabulary_size, output_dim, input_length): takes an integer matrix
# of shape (batch, input_length) whose largest value must be smaller than
# vocabulary_size, and outputs (batch, input_length, output_dim).
model.add(Embedding(n_most_freq_words, emb_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.7))
model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
model.add(Dense(4, activation='softmax'))

# categorical_crossentropy matches the one-hot encoded targets.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" to the output.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 130, 128)          1024000   
                                                                 
 spatial_dropout1d_2 (Spati  (None, 130, 128)          0         
 alDropout1D)                                                    
                                                                 
 lstm_2 (LSTM)               (None, 64)                49408     
                                                                 
 dense_2 (Dense)             (None, 4)                 260       
                                                                 
Total params: 1073668 (4.10 MB)
Trainable params: 1073668 (4.10 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [None]:
# Stop training once val_loss stops improving. restore_best_weights=True rolls
# the model back to the epoch with the lowest val_loss; without it the model
# keeps the weights of the last (worse) epoch.
early_stopping = EarlyStopping(monitor='val_loss', restore_best_weights=True)

# 20% of the training rows are set aside for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [None]:
# Score the trained model on the held-out test split.
test_metrics = model.evaluate(X_test, y_test, verbose=1)
test_loss, test_acc = test_metrics

print(f'test set loss: {test_loss}, test set accuracy: {test_acc}')

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit.
acc = history.history['acc']
val_acc = history.history['val_acc']
val_loss = history.history['val_loss']
loss = history.history['loss']

# 1-based epoch numbers for the x axis. Kept under its own name so the
# `epochs` hyperparameter is not overwritten and the training cell can still
# be re-run after this one.
epoch_axis = range(1, len(acc) + 1)

# Accuracy figure: distinct colours/markers so the two curves can be told apart.
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(epoch_axis, acc, 'bo-', label='Training Accuracy')
ax_acc.plot(epoch_axis, val_acc, 'rs--', label='Validation Accuracy')
ax_acc.set(title='Training and Validation Accuracy', xlabel='Epoch', ylabel='Accuracy')
ax_acc.legend()

# Loss figure, styled the same way.
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epoch_axis, loss, 'bo-', label='Training Loss')
ax_loss.plot(epoch_axis, val_loss, 'rs--', label='Validation Loss')
ax_loss.set(title='Training and Validation Loss', xlabel='Epoch', ylabel='Loss')
ax_loss.legend()

plt.show()