In [3]:
import json
import pandas as pd
import numpy as np
import os

## Load data


In [4]:
data_dir = "../data/"
labels = ['politics','sport','health','money','local']
# Cached CSV of the assembled dataset, so the per-article JSON files are only
# parsed on the first run.
data_csv_path = os.path.join(data_dir, "data_df.csv")

if os.path.exists(data_csv_path):
    # DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0
    # restores the row index that to_csv wrote as the first column.
    data_df = pd.read_csv(data_csv_path, index_col=0)
else:
    # One [title, text, label] record per article file found under
    # <data_dir>/<label>/.
    data = []
    for label in labels:
        label_dir = os.path.join(data_dir, label)
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore the seeded train/test split) reproducible.
        for name in sorted(os.listdir(label_dir)):
            # Context manager closes each file promptly; explicit UTF-8 avoids
            # depending on the platform's locale encoding.
            with open(os.path.join(label_dir, name), encoding="utf-8") as article_file:
                article = json.load(article_file)
            data.append([article['title'], article['text'], label])
    # Build the frame straight from the records: going through np.array would
    # allocate a fixed-width unicode array sized to the longest article.
    data_df = pd.DataFrame(data, columns=['title','text','label'])
    data_df.to_csv(data_csv_path)

## Preprocess data
1.  Map categories to discrete numerical values;
2.  Transform all words to lowercase;
3.  Remove all punctuation.

In [16]:
import string

# Integer id assigned to each article category.
label_ids = {'local': 0, 'politics': 1, 'sport': 2, 'health': 3, 'money': 4}
# Translation table that deletes every ASCII punctuation character; built once
# instead of once per row.
punctuation_table = str.maketrans('', '', string.punctuation)

# .get(value, value) leaves already-encoded ids untouched, so re-running this
# cell does not turn every label into NaN.
data_df['label'] = data_df.label.map(lambda value: label_ids.get(value, value))

for column in ('title', 'text'):
    # An empty title/text is read back from the CSV cache as NaN; fillna('')
    # keeps those rows as empty strings instead of failing on a float.
    data_df[column] = (
        data_df[column]
        .fillna('')
        .str.lower()
        .str.translate(punctuation_table)
    )

data_df.head()

Unnamed: 0,title,text,label
0,a wild weekend of trump tweets,washington cnn even by his standards president...,1
1,scaramucci suggests trump may veto bipartisan ...,story highlights gop leaders havent commented ...,1
2,top state department official out after 3 months,story highlights maliz beams had been appointe...,1
3,athletes call out trump after national anthem ...,new york cnn president donald trump drew the i...,1
4,chief of staff to congresswoman placed on leav...,story highlights rep brenda lawrence wrote pro...,1


## Split into train and test data sets

In [43]:
from sklearn.model_selection import train_test_split

# Inputs are the cleaned article bodies; targets are the encoded labels.
article_texts = data_df['text'].values
article_labels = data_df['label'].values

# random_state is fixed so the same split is produced on every run; no
# test_size is passed, so the library default applies.
X_train, X_test, y_train, y_test = train_test_split(
    article_texts, article_labels, random_state=1
)

for caption, split in (("Training dataset: ", X_train), ("Test dataset: ", X_test)):
    print(caption, split.shape)

Training dataset:  (7608,)
Test dataset:  (2537,)


## Extract features
Apply bag-of-words processing to the dataset.

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

# Token-count features with the built-in English stop-word list removed.
count_vector = CountVectorizer(stop_words='english')

# The vocabulary is learned from the training texts only; the test texts are
# encoded with that same vocabulary.
training_data = count_vector.fit_transform(raw_documents=X_train)
testing_data = count_vector.transform(raw_documents=X_test)

## Train Multinomial Naive Bayes classifier

In [45]:
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be
# chained into a single assignment.
naive_bayes = MultinomialNB().fit(training_data, y_train)
# Trailing expression so the cell still displays the fitted estimator.
naive_bayes

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## Prediction

In [46]:
# Predicted label id for every document in the held-out test matrix.
predictions = naive_bayes.predict(X=testing_data)

In [47]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

print("Accuracy score: ", accuracy_score(y_test, predictions))

# The remaining metrics are per-class scores combined with average='weighted'.
weighted_metrics = (
    ("Recall score: ", recall_score),
    ("Precision score: ", precision_score),
    ("F1 score: ", f1_score),
)
for caption, metric in weighted_metrics:
    print(caption, metric(y_test, predictions, average='weighted'))

Accuracy score:  0.87780843516
Recall score:  0.87780843516
Precision score:  0.880418028427
F1 score:  0.87835925049
