***Dataset:*** http://qwone.com/~jason/20Newsgroups/

In [45]:
import pandas as pd
import numpy as np
import pickle
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, load_model
from keras.layers import Activation, Dense, Dropout
from sklearn.preprocessing import LabelBinarizer
import sklearn.datasets as skds
from pathlib import Path

In [2]:
# Fix the NumPy seed so repeated runs behave the same way
np.random.seed(1237)

# Directory holding the raw training posts (one sub-folder per newsgroup)
path_train = "datasets/20news-bydate/20news-bydate-train"
# load_content=False: gather only file paths and labels, not the file bodies
files_train = skds.load_files(path_train, load_content=False)

# Pull out the pieces of the loader result that later cells rely on
label_index, label_names, labelled_files = (
    files_train.target,
    files_train.target_names,
    files_train.filenames,
)

In [3]:
# Integer class index of each training file (used to index into label_names)
label_index

array([ 9,  4, 11, ..., 16, 18,  4])

In [4]:
# Newsgroup names; a file's label_index value selects its name from this list
label_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
# Paths of the training files, in the same order as label_index
labelled_files

array(['datasets/20news-bydate/20news-bydate-train/rec.sport.baseball/102736',
       'datasets/20news-bydate/20news-bydate-train/comp.sys.mac.hardware/50485',
       'datasets/20news-bydate/20news-bydate-train/sci.crypt/15246', ...,
       'datasets/20news-bydate/20news-bydate-train/talk.politics.guns/54663',
       'datasets/20news-bydate/20news-bydate-train/talk.politics.misc/178534',
       'datasets/20news-bydate/20news-bydate-train/comp.sys.mac.hardware/51858'],
      dtype='<U73')

In [30]:
# Column names for the records collected below
data_tags = ["filename", "category", "news"]

# One (filename, category, text) record per training file
data_list = []
for post_path, class_idx in zip(labelled_files, label_index):
    with open(post_path, 'r', encoding='ISO-8859-1') as post_file:
        # Flatten the post onto a single line
        post_text = post_file.read().replace('\n', ' ')
    data_list.append((post_path, label_names[class_idx], post_text))

In [32]:
# Assemble the collected records into a DataFrame with the columns
# filename, category and news
data = pd.DataFrame(data_list, columns=data_tags)

In [33]:
# Preview the first rows of the assembled DataFrame
data.head()

Unnamed: 0,filename,category,news
0,datasets/20news-bydate/20news-bydate-train/rec...,rec.sport.baseball,From: cubbie@garnet.berkeley.edu ( ...
1,datasets/20news-bydate/20news-bydate-train/com...,comp.sys.mac.hardware,From: gnelson@pion.rutgers.edu (Gregory Nelson...
2,datasets/20news-bydate/20news-bydate-train/sci...,sci.crypt,From: crypt-comments@math.ncsu.edu Subject: Cr...
3,datasets/20news-bydate/20news-bydate-train/com...,comp.sys.mac.hardware,From: () Subject: Re: Quadra SCSI Problems???...
4,datasets/20news-bydate/20news-bydate-train/alt...,alt.atheism,From: keith@cco.caltech.edu (Keith Allan Schne...


In [34]:
# The first 80% of the rows are used for training; the remaining 20% are held out for testing.
train_size = int(len(data) * .8)

# Slice the rows once, then pick the individual columns from each part
train_part = data.iloc[:train_size]
test_part = data.iloc[train_size:]

train_posts, train_tags, train_files_names = (
    train_part['news'],
    train_part['category'],
    train_part['filename'],
)
test_posts, test_tags, test_files_names = (
    test_part['news'],
    test_part['category'],
    test_part['filename'],
)

In [35]:
# Number of newsgroup classes, vocabulary cap and mini-batch size
num_labels = 20
vocab_size = 15000
batch_size = 100

# Fit the tokenizer on the training posts only; num_words caps the vocabulary
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_posts)

# Turn both splits into TF-IDF matrices using the same fitted tokenizer
x_train, x_test = (
    tokenizer.texts_to_matrix(posts, mode='tfidf')
    for posts in (train_posts, test_posts)
)

# Binarize the category names; the encoder is fitted on the training tags
encoder = LabelBinarizer()
y_train = encoder.fit_transform(train_tags)
y_test = encoder.transform(test_tags)

In [39]:
# TF-IDF feature vector of the first training post
x_train[0]

array([ 0.        ,  1.89863301,  1.2686438 , ...,  0.        ,
        0.        ,  0.        ])

In [38]:
# Binarized label vector of the first training post
y_train[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [40]:
# Feed-forward classifier: two 512-unit ReLU layers with dropout, followed by
# a softmax layer with one unit per newsgroup
model = Sequential([
    Dense(512, input_shape=(vocab_size,)),
    Activation('relu'),
    Dropout(0.3),
    Dense(512),
    Activation('relu'),
    Dropout(0.3),
    Dense(num_labels),
    Activation('softmax'),
])
model.summary()

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# validation_split=0.1 sets aside a tenth of the training data for validation
fit_options = dict(batch_size=batch_size,
                   epochs=30,
                   verbose=1,
                   validation_split=0.1)
history = model.fit(x_train, y_train, **fit_options)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               7680512   
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_2 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 20)                10260     
__________

In [41]:
# Score the model on the held-out 20% split (these rows come from the
# training directory; they were simply not used for fitting)
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)

print('Test accuracy:', score[1])

# Class names in the column order produced by the LabelBinarizer
text_labels = encoder.classes_

# Spot-check the first few held-out posts. Predict them in one batched call
# instead of one call per row, and cap the count so a split with fewer than
# ten rows does not raise an IndexError.
n_preview = min(10, len(x_test))
predictions = model.predict(x_test[:n_preview])
for i in range(n_preview):
    predicted_label = text_labels[np.argmax(predictions[i])]
    print(test_files_names.iloc[i])
    print('Actual label:' + test_tags.iloc[i])
    print("Predicted label: " + predicted_label)

Test accuracy: 0.87936367701
datasets/20news-bydate/20news-bydate-train/alt.atheism/53114
Actual label:alt.atheism
Predicted label: alt.atheism
datasets/20news-bydate/20news-bydate-train/comp.graphics/38666
Actual label:comp.graphics
Predicted label: comp.graphics
datasets/20news-bydate/20news-bydate-train/sci.med/58932
Actual label:sci.med
Predicted label: sci.med
datasets/20news-bydate/20news-bydate-train/sci.crypt/15212
Actual label:sci.crypt
Predicted label: sci.crypt
datasets/20news-bydate/20news-bydate-train/comp.os.ms-windows.misc/9695
Actual label:comp.os.ms-windows.misc
Predicted label: comp.os.ms-windows.misc
datasets/20news-bydate/20news-bydate-train/rec.sport.baseball/104482
Actual label:rec.sport.baseball
Predicted label: rec.sport.baseball
datasets/20news-bydate/20news-bydate-train/soc.religion.christian/20731
Actual label:soc.religion.christian
Predicted label: sci.med
datasets/20news-bydate/20news-bydate-train/comp.graphics/38583
Actual label:comp.graphics
Predicted lab

In [43]:
# Make sure the output directory exists so saving works on a fresh checkout
Path('models').mkdir(parents=True, exist_ok=True)

# Write the model to a single HDF5 file.
# save() is called on the Sequential model itself: the `.model` attribute is a
# deprecated alias that newer Keras releases no longer provide.
model.save('models/20newsgroups_model.h5')

# Save Tokenizer i.e. Vocabulary, so new text can be vectorized the same way
with open('models/20newsgroups_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [46]:
# Load the saved model from the HDF5 file
model = load_model('models/20newsgroups_model.h5')

# Load the fitted tokenizer saved alongside the model.
# NOTE: pickle.load can execute arbitrary code; only unpickle files that
# were produced by this notebook or another trusted source.
with open('models/20newsgroups_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [47]:
# Class names held by the fitted LabelBinarizer
encoder.classes_ #LabelBinarizer

array(['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
       'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
       'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles',
       'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
       'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian',
       'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc',
       'talk.religion.misc'],
      dtype='<U24')

In [48]:
# These are the labels we stored from our training (same content and order as
# encoder.classes_). The order is very important here: the index of the
# highest-scoring model output is used to look up the name in this array.

labels = np.array(['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x',
 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast',
 'talk.politics.misc', 'talk.religion.misc'])

test_files = ["datasets/20news-bydate/20news-bydate-test/comp.graphics/38758",
              "datasets/20news-bydate/20news-bydate-test/misc.forsale/76115",
              "datasets/20news-bydate/20news-bydate-test/soc.religion.christian/21329"
              ]

# Read each post exactly the way the training posts were read (ISO-8859-1,
# newlines flattened to spaces). Relying on the platform default encoding can
# raise UnicodeDecodeError or yield different text than the model was trained on.
x_data = [
    Path(t_f).read_text(encoding='ISO-8859-1').replace('\n', ' ')
    for t_f in test_files
]

x_data_series = pd.Series(x_data)
x_tokenized = tokenizer.texts_to_matrix(x_data_series, mode='tfidf')

# Predict all files in one batched call, then report one line per file
predictions = model.predict(x_tokenized)
for t_f, prediction in zip(test_files, predictions):
    predicted_label = labels[np.argmax(prediction)]
    print("File ->", t_f, "Predicted label: " + predicted_label)

File -> datasets/20news-bydate/20news-bydate-test/comp.graphics/38758 Predicted label: comp.graphics
File -> datasets/20news-bydate/20news-bydate-test/misc.forsale/76115 Predicted label: misc.forsale
File -> datasets/20news-bydate/20news-bydate-test/soc.religion.christian/21329 Predicted label: soc.religion.christian
