In [1]:
import json
import pandas as pd
import numpy as np
import os

## Load data


In [2]:
data_dir = "../data/"
labels = ['politics','sport','health','money','local']

if os.path.exists("../data/data_df.csv"):
    data_df = pd.DataFrame.from_csv("../data/data_df.csv")
else:
    data = []
    for l in labels:
        for name in os.listdir(data_dir + l):
            article = json.load(open(data_dir + l + "/" + name))
            text = article['text']
            if text.endswith("Read More"):
                text = text[:-len("Read More")]
            art_list = [article['title'],text,l]
            data.append(art_list)
    data = np.array(data)
    data_df = pd.DataFrame(data,columns=['title','text','label'])
    data_df.to_csv("../data/data_df.csv")

## Preprocess data
1.  categories into discrete numerical values;
2.  Transform all words to lowercase;
3.  Remove all punctuations and stopwords.
4.  Tokenization and stemming the tokens.
5.  Replaced numerical values with '#num#' to reduce vocabulary size.

In [None]:
import string
from nltk.stem.porter import *
import nltk 
import re

if (!os.path.exists("../data/data_df_processed.csv"):
    data_df_processed = data_df.copy()
    data_df_processed['label'] = data_df.label.map({ 'local':0, 'politics': 1, 'sport': 2, 'health': 3, 'money': 4})
    data_df_processed['title'] = data_df.title.map(
        lambda x: x.lower().translate(str.maketrans('','', string.punctuation))
    )
    data_df_processed['text'] = data_df.text.map(
        lambda x: x.lower().translate(str.maketrans('','', string.punctuation))
    )




    stemmer = PorterStemmer()

    data_df_processed['title_stem'] = data_df_processed.title.map(
        lambda x: ' '.join([stemmer.stem(w) for w in nltk.word_tokenize(x) if w not in stopwords.words('english')]))

    data_df_processed['text_stem'] = data_df_processed.text.map(
        lambda x: ' '.join([stemmer.stem(w) for w in nltk.word_tokenize(x) if w not in stopwords.words('english')]))

    regex = re.compile(r"\d+", re.IGNORECASE)

    data_df_processed['title_stem'] = data_df_processed.title_stem.map(
        lambda x: regex.sub("spnumsp", x))

    data_df_processed['text_stem'] = data_df_processed.text_stem.map(
        lambda x: regex.sub("spnumsp", x))
    
    data_df_processed['title+text_stem'] = data_df_processed['text_stem'].values + data_df_processed['title_stem'].values
    
    data_df_processed['title+text'] = data_df_processed['text'].values + data_df_processed['title'].values
    
    data_df_processed.to_csv("../data/data_df_processed.csv")
else:
    data_df_processed = pd.DataFrame.from_csv("../data/data_df_processed.csv")

In [36]:
data_df_processed['title_nonum'] = data_df_processed.title.map(
    lambda x: regex.sub("spnumsp", x))

data_df_processed['text_nonum'] = data_df_processed.text.map(
    lambda x: regex.sub("spnumsp", x))

## Split into train and test data sets

In [48]:
from sklearn.model_selection import train_test_split
#     data_df_processed['text_nonum'].values + data_df_processed['title_nonum'].values , 
X_train, X_test, y_train, y_test = train_test_split(
    data_df_processed['title+text'].values,
    data_df_processed['label'].values, 
    random_state = 1
)

print("Training dataset: ", X_train.shape)
print("Test dataset: ", X_test.shape)

Training dataset:  (7608,)
Test dataset:  (2537,)


## Extract features
Apply bag of words processing to the dataset

In [83]:
from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer(stop_words = 'english', ngram_range=(1, 4),  max_features=None, min_df=2)
# count_vector = CountVectorizer(stop_words = 'english')
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

## Train Multinomial Naive Bayes classifier

In [84]:
from sklearn.naive_bayes import MultinomialNB

naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## Prediction

In [85]:
predictions = naive_bayes.predict(testing_data)
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

print("Accuracy score: ", accuracy_score(y_test, predictions))
print("Recall score: ", recall_score(y_test, predictions, average = 'weighted'))
print("Precision score: ", precision_score(y_test, predictions, average = 'weighted'))
print("F1 score: ", f1_score(y_test, predictions, average = 'weighted'))

Accuracy score:  0.90185258179
Recall score:  0.90185258179
Precision score:  0.902260376079
F1 score:  0.901933255216


In [82]:
# no stem, text+title, 0.880961765865
# no stem no number , text+title, 0.881355932203
# stem, text+title, 0.871501773749

# no stem no number , text+title, ngram_range=(1, 2),  max_features=None, min_df = 5, 0.885691761924
# no stem, text+title, ngram_range=(1, 2),  max_features=None, min_df = 5, 0.889239258967
# no stem, text+title, ngram_range=(1, 2),  max_features=None, min_df = 10, 0.878990934174
# no stem, text+title, ngram_range=(1, 2),  max_features=None, min_df = 8, 0.884509262909
# no stem, text+title, ngram_range=(1, 2),  max_features=None, min_df = 3, 0.896728419393
# no stem, text+title, ngram_range=(1, 2),  max_features=None, min_df = 2, 0.899881750099
# no stem, text+title, ngram_range=(1, 2),  max_features=None, min_df = 1, 0.882538431218
# no stem, text+title, ngram_range=(1, 2),  max_features=None, min_df = 4, 0.891604256996
# no stem, text+title, ngram_range=(1, 3),  max_features=None, min_df = 2, 0.900670082775
# no stem, text+title, ngram_range=(1, 4),  max_features=None, min_df = 2, 0.90185258179
 