In [1]:
import json
import pandas as pd
import numpy as np
import os

## Load data


In [2]:
data_dir = "../data/"
labels = ['politics','sport','health','money','local']

if os.path.exists("../data/data_df.csv"):
    data_df = pd.DataFrame.from_csv("../data/data_df.csv")
else:
    data = []
    for l in labels:
        for name in os.listdir(data_dir + l):
            article = json.load(open(data_dir + l + "/" + name))
            text = article['text']
            if text.endswith("Read More"):
                text = text[:-len("Read More")]
            art_list = [article['title'],text,l]
            data.append(art_list)
    data = np.array(data)
    data_df = pd.DataFrame(data,columns=['title','text','label'])
    data_df.to_csv("../data/data_df.csv")

## Preprocess data
1.  categories into discrete numerical values;
2.  Transform all words to lowercase;
3.  Remove all punctuations and stopwords.
4.  Tokenization and stemming lemmatization the tokens.
5.  Replaced numerical values with '#num#' to reduce vocabulary size.

In [11]:
import string
from nltk.stem.porter import *
import nltk 
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

if (os.path.exists("../data/data_df_processed.csv") is not True):
    data_df_processed = data_df.copy()
    data_df_processed['label'] = data_df.label.map({ 'local':0, 'politics': 1, 'sport': 2, 'health': 3, 'money': 4})
    data_df_processed['title'] = data_df.title.map(
        lambda x: x.lower().translate(str.maketrans('','', string.punctuation))
    )
    data_df_processed['text'] = data_df.text.map(
        lambda x: x.lower().translate(str.maketrans('','', string.punctuation))
    )


    stemmer = PorterStemmer()

    data_df_processed['title_stem'] = data_df_processed.title.map(
        lambda x: ' '.join([stemmer.stem(w) for w in nltk.word_tokenize(x) if w not in stopwords.words('english')]))

    data_df_processed['text_stem'] = data_df_processed.text.map(
        lambda x: ' '.join([stemmer.stem(w) for w in nltk.word_tokenize(x) if w not in stopwords.words('english')]))

    regex = re.compile(r"\d+", re.IGNORECASE)

    data_df_processed['title_stem'] = data_df_processed.title_stem.map(
        lambda x: regex.sub("spnumsp", x))

    data_df_processed['text_stem'] = data_df_processed.text_stem.map(
        lambda x: regex.sub("spnumsp", x))
    
    data_df_processed['title+text_stem'] = data_df_processed['text_stem'].values + data_df_processed['title_stem'].values
    
    data_df_processed['title+text'] = data_df_processed['text'].values + data_df_processed['title'].values
    
    data_df_processed['title_nonum'] = data_df_processed.title.map(
        lambda x: regex.sub("spnumsp", x))

    data_df_processed['text_nonum'] = data_df_processed.text.map(
        lambda x: regex.sub("spnumsp", x))
    
    lemmer = WordNetLemmatizer()
    data_df_processed['title_lem'] = data_df_processed.title.map(
        lambda x: ' '.join([lemmer.lemmatize(w) for w in nltk.word_tokenize(x) if w not in stopwords.words('english')]))


    data_df_processed['text_lem'] = data_df_processed.text.map(
        lambda x: ' '.join([lemmer.lemmatize(w) for w in nltk.word_tokenize(x) if w not in stopwords.words('english')]))

    data_df_processed['title+text_lem'] = data_df_processed['text_lem'].values + data_df_processed['title_lem'].values
    data_df_processed.to_csv("../data/data_df_processed.csv")
else:
    data_df_processed = pd.DataFrame.from_csv("../data/data_df_processed.csv")

In [25]:
data_df_processed.to_csv("../data/data_df_processed.csv")

## Split into train and test data sets

In [35]:
from sklearn.model_selection import train_test_split
#     data_df_processed['text_nonum'].values + data_df_processed['title_nonum'].values , 
X_train, X_test, y_train, y_test = train_test_split(
    data_df_processed['text'].values,
    data_df_processed['label'].values, 
    random_state = 1
)

print("Training dataset: ", X_train.shape)
print("Test dataset: ", X_test.shape)

Training dataset:  (7608,)
Test dataset:  (2537,)


## Extract features
Apply bag of words processing to the dataset

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer(stop_words = 'english', ngram_range=(1, 4),  max_features=None, min_df=2)
# count_vector = CountVectorizer(stop_words = 'english')
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

## Train Multinomial Naive Bayes classifier

In [37]:
from sklearn.naive_bayes import MultinomialNB

naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## Prediction

In [38]:
predictions = naive_bayes.predict(testing_data)
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

print("Accuracy score: ", accuracy_score(y_test, predictions))
print("Recall score: ", recall_score(y_test, predictions, average = 'weighted'))
print("Precision score: ", precision_score(y_test, predictions, average = 'weighted'))
print("F1 score: ", f1_score(y_test, predictions, average = 'weighted'))

Accuracy score:  0.89948758376
Recall score:  0.89948758376
Precision score:  0.899908688368
F1 score:  0.8995475469


In [None]:
# no stem, text+title, 0.880961765865
# no stem no number , text+title, 0.881355932203
# stem, text+title, 0.871501773749
# title_lem, ngram_range=(1, 4),  max_features=None, min_df = 2, 0.759952700039
# title, ngram_range=(1, 4),  max_features=None, min_df = 2, 0.745368545526

# text+title, ngram_range=(1, 2),  max_features=None, min_df = 5, 0.885691761924
# text+title, ngram_range=(1, 2),  max_features=None, min_df = 5, 0.889239258967
# text+title, ngram_range=(1, 2),  max_features=None, min_df = 10, 0.878990934174
# text+title, ngram_range=(1, 2),  max_features=None, min_df = 8, 0.884509262909
# text+title, ngram_range=(1, 2),  max_features=None, min_df = 3, 0.896728419393
# text+title, ngram_range=(1, 2),  max_features=None, min_df = 2, 0.899881750099
# text+title, ngram_range=(1, 2),  max_features=None, min_df = 1, 0.882538431218
# text+title, ngram_range=(1, 2),  max_features=None, min_df = 4, 0.891604256996
# text+title, ngram_range=(1, 3),  max_features=None, min_df = 2, 0.900670082775
# text+title, ngram_range=(1, 4),  max_features=None, min_df = 2, 0.90185258179
# text+title_lem, ngram_range=(1, 4),  max_features=None, min_df = 2, 0.901064249113
# text_lem, ngram_range=(1, 4),  max_features=None, min_df = 2, 0.898305084746
# text, ngram_range=(1, 4),  max_features=None, min_df = 2, 0.89948758376
 

In [34]:
data_df_processed.head()

Unnamed: 0,title,text,label,title_stem,text_stem,title+text_stem,title+text,title_lem,text_lem,title+text_lem
0,a wild weekend of trump tweets,washington cnn even by his standards president...,1,wild weekend trump tweet,washington cnn even standard presid donald tru...,washington cnn even standard presid donald tru...,washington cnn even by his standards president...,wild weekend trump tweet,washington cnn even standard president donald ...,washington cnn even standard president donald ...
1,scaramucci suggests trump may veto bipartisan ...,story highlights gop leaders havent commented ...,1,scaramucci suggest trump may veto bipartisan r...,stori highlight gop leader havent comment pote...,stori highlight gop leader havent comment pote...,story highlights gop leaders havent commented ...,scaramucci suggests trump may veto bipartisan ...,story highlight gop leader havent commented po...,story highlight gop leader havent commented po...
2,top state department official out after 3 months,story highlights maliz beams had been appointe...,1,top state depart offici spnumsp month,stori highlight maliz beam appoint counselor s...,stori highlight maliz beam appoint counselor s...,story highlights maliz beams had been appointe...,top state department official 3 month,story highlight maliz beam appointed counselor...,story highlight maliz beam appointed counselor...
3,athletes call out trump after national anthem ...,new york cnn president donald trump drew the i...,1,athlet call trump nation anthem curri remark,new york cnn presid donald trump drew ire prof...,new york cnn presid donald trump drew ire prof...,new york cnn president donald trump drew the i...,athlete call trump national anthem curry remark,new york cnn president donald trump drew ire p...,new york cnn president donald trump drew ire p...
4,chief of staff to congresswoman placed on leav...,story highlights rep brenda lawrence wrote pro...,1,chief staff congresswoman place leav follow se...,stori highlight rep brenda lawrenc wrote propo...,stori highlight rep brenda lawrenc wrote propo...,story highlights rep brenda lawrence wrote pro...,chief staff congresswoman placed leave followi...,story highlight rep brenda lawrence wrote prop...,story highlight rep brenda lawrence wrote prop...
