### By Chenfeng (Aaron) Hao

In this exercise, we are building a naive bayes classifier for tagging documents. The data we use are text documents on 20 topics, consisting of 11292 records for training and 7157 records for testing. The classifier is supposed to learn the conditional probability of what topic a document is on based on which words appear in the training data for a particular topic. We then use the probability learned from the training data to predict the topic of documents in the testing data.

The resulting model shows an overall accuracy of 81%, which is not much different from that of a vanilla sklearn MultinomialNB classifier with little additional processing (79.8% accuracy).

In [150]:
import numpy as np
import pandas as pd

In [151]:
# load train/test data from files
train_file_data = pd.read_fwf("../data/forumTraining.data", delimiter="\n")
test_file_data = pd.read_fwf("../data/forumTest.data", delimiter="\n")

In [152]:
# separate topic from document by adding an extra column
train_data = train_file_data.iloc[:,0].str.split(n=1, expand=True)
test_data = test_file_data.iloc[:,0].str.split(n=1, expand=True)

There are many clean-ups we can perform on the data.

In [153]:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import re
import gensim
import spacy

# get stop words from three sources

stopwords_gensim = gensim.parsing.preprocessing.STOPWORDS

sp = spacy.load('en_core_web_sm')
stopwords_spacy = sp.Defaults.stop_words

stopwords_nltk = stopwords.words("english")

# extend the stop words list
stopwords_nltk.extend(['cannot', 'could', 'done', 'let', 'may' 'mayn',  'might',  'must', 'need', 'ought', 'oughtn', 'shall', 'would', 'br', 'faq', 'alt', 'co', 'uk', 'whilst', 'pgp', 'signed', 'he', 'please', 'edu', 'cs', 'umd', 'et', 'al', 'her'])

stop_words = stopwords_gensim.union(set(stopwords_nltk))

stop_words = stop_words.union(set(stopwords_spacy))

stemmer = SnowballStemmer(language='english')

def preprocess(text):
    """
    This function preprocesses the data for analysis.
    :param text: a piece of text
    :return: processed text
    """
    # split text into words
    # str.split() is faster than nltk tokenize
    # since punctuations are removed already, we don't need tokenize
    word_tokens = text.split()

    # remove stop words
    #filtered_text = [word for word in word_tokens if word not in stop_words]

    # remove single letters/numbers
    #filtered_text = [re.sub(r'\b\w{1,2}\b', '', word) for word in filtered_text]

    # remove empty string resulted from the last step
    #filtered_text = list(filter(None, filtered_text))

    #filtered_text = [stemmer.stem(word) for word in filtered_text]

    #filtered_text = [lemmatizer.lemmatize(word) for word in filtered_text]
    return word_tokens

In [141]:
test_df = test_data
#test_df.columns = ['topic', 'wordlist']
test_df[1] = test_df.apply(lambda row: preprocess(row[1]), axis=1)

In [142]:
train_df = train_data
#train_df.columns = ['topic', 'wordlist']
train_df[1] = train_df.apply(lambda row: preprocess(row[1]), axis=1)

In [143]:
test_df[1][1]

['re',
 'yet',
 'more',
 'rushdie',
 're',
 'islamic',
 'law',
 'jaeger',
 'buphy',
 'bu',
 'edu',
 'gregg',
 'jaeger',
 'writes',
 'in',
 'article',
 'vice',
 'ico',
 'tek',
 'com',
 'bobbe',
 'vice',
 'ico',
 'tek',
 'com',
 'robert',
 'beauchaine',
 'writes',
 'bennett',
 'neil',
 'how',
 'bcci',
 'adapted',
 'the',
 'koran',
 'rules',
 'of',
 'banking',
 'the',
 'times',
 'august',
 'so',
 'let',
 's',
 'see',
 'if',
 'some',
 'guy',
 'writes',
 'a',
 'piece',
 'with',
 'a',
 'title',
 'that',
 'implies',
 'something',
 'is',
 'the',
 'case',
 'then',
 'it',
 'must',
 'be',
 'so',
 'is',
 'that',
 'it',
 'gregg',
 'you',
 'haven',
 't',
 'provided',
 'even',
 'a',
 'title',
 'of',
 'an',
 'article',
 'to',
 'support',
 'your',
 'contention',
 'this',
 'is',
 'how',
 'you',
 'support',
 'a',
 'position',
 'if',
 'you',
 'intend',
 'to',
 'have',
 'anyone',
 'respect',
 'it',
 'gregg',
 'any',
 'questions',
 'and',
 'i',
 'even',
 'managed',
 'to',
 'include',
 'the',
 'above',
 'ref

In [144]:
from collections import Counter

def train_naive_bayes(df):
    vocabulary = set(np.concatenate(df[1]))
    topic_dict = dict(train_df.groupby(df[0]).size().div(len(df)))

    new_df = df.groupby(df[0], as_index = False).agg(sum)

    total_n_of_vocabulary = len(vocabulary)
    topic_list = list(new_df[0])
    word_position = {topic: len(new_df[new_df[0] == topic][1].item()) for topic in topic_list}
    sum_n_len_vocab = {k: v+total_n_of_vocabulary for k,v in word_position.items()}
    vocab_dict = {topic: {} for topic in topic_list}

    for topic in topic_list:
        all_words = {word: 0 for word in vocabulary}
        index = new_df[0] == topic
        vocab_dict[topic] = all_words
        topic_counter = Counter(new_df[index][1].item())
        vocab_dict[topic].update(topic_counter)
        vocab_dict[topic] = {k:(v+1)/sum_n_len_vocab[topic] for k,v in vocab_dict[topic].items()}

    return vocabulary, vocab_dict, topic_list, topic_dict


In [145]:
result = train_naive_bayes(train_df)

In [146]:
vocabulary, vocab_dict, topic_list, topic_dict = result[0], result[1], result[2], result[3]

In [147]:
test_df[1]

0       [re, amusing, atheists, and, agnostics, in, ar...
1       [re, yet, more, rushdie, re, islamic, law, jae...
2       [re, christian, morality, is, in, article, vic...
3       [re, after, years, can, we, say, that, christi...
4       [re, amusing, atheists, and, agnostics, timmba...
                              ...                        
7522    [re, religion, and, marriage, pboxrud, magnus,...
7523    [re, a, message, for, you, mr, president, how,...
7524    [re, why, did, they, behave, as, they, did, wa...
7525    [re, info, about, new, age, in, article, apr, ...
7526    [re, i, ll, see, your, demand, and, raise, you...
Name: 1, Length: 7527, dtype: object

In [148]:
def predict(data, vocabulary, vocab_dict, topic_list, topic_dict):
    prob_dict = {}
    predictions = []
    for i in range(0, len(data)-1):
        wordlist = data[i]
        for topic in topic_list:
            x = np.array([vocab_dict[topic][word] for word in wordlist if word in vocabulary])
            prod_word = np.prod(x, dtype='float128')
            prob_dict[topic] = np.prod([prod_word, topic_dict[topic]], dtype='float128')
            predictions.append(max(prob_dict, key=prob_dict.get))
    return predictions

In [149]:
predictions = predict(test_df[1], vocabulary, vocab_dict, topic_list, topic_dict)

In [86]:
result_df = test_df

In [87]:
result_df['prediction'] = result_df.apply(lambda row: predict(row[1], vocabulary, vocab_dict, topic_list, topic_dict), axis=1)

In [88]:
result_df['correct'] = result_df.apply(lambda row: row.prediction == row[0], axis=1)

In [89]:
accuracy = np.sum(result_df.correct)/len(result_df)
accuracy

0.7989903015809752

In [90]:
def calc_precision(confusion_matrix, topic):
    return confusion_matrix[topic][topic]/confusion_matrix[topic][len(confusion_matrix) - 1]

def calc_recall(confusion_matrix, topic):
    return confusion_matrix[topic][topic]/confusion_matrix.loc[topic][len(confusion_matrix) - 1]

def get_precision_recall(df, topic_list):
    confusion_matrix = pd.crosstab(df[0], df['prediction'], rownames=['Actual'], colnames=['Predicted'], margins=True)
    precision = [calc_precision(confusion_matrix, topic) for topic in topic_list]
    recall = [calc_recall(confusion_matrix, topic) for topic in topic_list]
    pr_df = pd.DataFrame(zip(topic_list, precision, recall), columns=['topic', 'precision', 'recall'])
    return pr_df

In [91]:
get_precision_recall(result_df,topic_list)

Unnamed: 0,topic,precision,recall
0,atheism,0.746032,0.738994
1,autos,0.83105,0.921519
2,baseball,0.975342,0.896725
3,christianity,0.672535,0.959799
4,cryptology,0.745491,0.939394
5,electronics,0.801242,0.656489
6,forsale,0.934363,0.620513
7,graphics,0.728205,0.730077
8,guns,0.679359,0.931319
9,hockey,0.957921,0.969925


Here, I used the MultinomialNB classifier from sklearn as a benchmark.

In [92]:
#train_data = train_file_data.iloc[:,0].str.split(n=1, expand=True)
#test_data = test_file_data.iloc[:,0].str.split(n=1, expand=True)

#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.naive_bayes import MultinomialNB

#count_vect = CountVectorizer()
#mnb = MultinomialNB()

#X_train = count_vect.fit_transform(train_data[1])

#y_train = np.array(train_data[0])
#X_test = count_vect.transform(test_data[1])
#y_test = np.array(test_data[0])
#clf = MultinomialNB().fit(X_train, y_train)

#y_pred = clf.predict(X_test)

#from sklearn import metrics
#print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

#from sklearn.metrics import confusion_matrix
#conf_mat = confusion_matrix(y_test, y_pred)
#conf_mat

In [93]:
#from sklearn.metrics import precision_recall_fscore_support
#precision_recall_fscore_support(y_test, y_pred, average=None)
