In [1]:
import json
import pandas as pd
import numpy as np
import os

## Load data


In [70]:
data_dir = "../data/"
labels = ['politics','sport','health','money','local']

if os.path.exists("../data/data_df.csv"):
    data_df = pd.DataFrame.from_csv("../data/data_df.csv")
else:
    data = []
    for l in labels:
        for name in os.listdir(data_dir + l):
            article = json.load(open(data_dir + l + "/" + name))
            text = article['text']
            if text.endswith("Read More"):
                text = text[:-len("Read More")]
            art_list = [article['title'],text,l]
            data.append(art_list)
    data = np.array(data)
    data_df = pd.DataFrame(data,columns=['title','text','label'])
    data_df.to_csv("../data/data_df.csv")

## Preprocess data
1.  categories into discrete numerical values;
2.  Transform all words to lowercase;
3.  Remove all punctuations.
4.  Tokenization and stemming the tokens.

In [72]:
import string

data_df['label'] = data_df.label.map({ 'local':0, 'politics': 1, 'sport': 2, 'health': 3, 'money': 4})
data_df['title'] = data_df.title.map(
    lambda x: x.lower().translate(str.maketrans('','', string.punctuation))
)
data_df['text'] = data_df.text.map(
    lambda x: x.lower().translate(str.maketrans('','', string.punctuation))
)

from nltk.stem.porter import *
import nltk 


stemmer = PorterStemmer()

data_df['title_stem'] = data_df.title.map(
    lambda x: ' '.join([stemmer.stem(w) for w in nltk.word_tokenize(x)]))

data_df['text_stem'] = data_df.text.map(
    lambda x: ' '.join([stemmer.stem(w) for w in nltk.word_tokenize(x)[:1000]]))

In [73]:
data_df.head()

Unnamed: 0,title,text,label,title_stem,text_stem
0,a wild weekend of trump tweets,washington cnn even by his standards president...,1,a wild weekend of trump tweet,washington cnn even by hi standard presid dona...
1,scaramucci suggests trump may veto bipartisan ...,story highlights gop leaders havent commented ...,1,scaramucci suggest trump may veto bipartisan r...,stori highlight gop leader havent comment on a...
2,top state department official out after 3 months,story highlights maliz beams had been appointe...,1,top state depart offici out after 3 month,stori highlight maliz beam had been appoint as...
3,athletes call out trump after national anthem ...,new york cnn president donald trump drew the i...,1,athlet call out trump after nation anthem curr...,new york cnn presid donald trump drew the ire ...
4,chief of staff to congresswoman placed on leav...,story highlights rep brenda lawrence wrote pro...,1,chief of staff to congresswoman place on leav ...,stori highlight rep brenda lawrenc wrote propo...


In [74]:
data_df['title+text_steam'] = data_df['text_stem'].values + data_df['title_stem'].values

## Split into train and test data sets

In [75]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data_df['title+text_steam'].values, 
    data_df['label'].values, 
    random_state = 1
)

print("Training dataset: ", X_train.shape)
print("Test dataset: ", X_test.shape)

Training dataset:  (7608,)
Test dataset:  (2537,)


## Extract features
Apply bag of words processing to the dataset

In [76]:
from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer(stop_words = 'english', ngram_range=(1, 2),  max_features=None)
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

## Train Multinomial Naive Bayes classifier

In [77]:
from sklearn.naive_bayes import MultinomialNB

naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## Prediction

In [78]:
predictions = naive_bayes.predict(testing_data)

In [79]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

print("Accuracy score: ", accuracy_score(y_test, predictions))
print("Recall score: ", recall_score(y_test, predictions, average = 'weighted'))
print("Precision score: ", precision_score(y_test, predictions, average = 'weighted'))
print("F1 score: ", f1_score(y_test, predictions, average = 'weighted'))

Accuracy score:  0.883326763894
Recall score:  0.883326763894
Precision score:  0.892754931437
F1 score:  0.883866583967
