In [2]:
import time
import csv
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score

TRAINING_NEWS_FILE = 'bbc_news_train.csv'
TEST_FILE = 'bbc_news_test.csv'
NON_WORD_CHAR = re.compile(r'\W+')

def get_csv_data(path):
    data_list = []
    with open(path) as csvfile:
        csvreader = csv.reader(csvfile)
        for row in csvreader:
            data_list.append((tokenize(row[0]), row[1]))
    return data_list

def tokenize(string): 
    true_tokens = []
    stop_words = set(stopwords.words('english'))
    possible_tokens = [NON_WORD_CHAR.sub('', word) for word in string.lower().strip().split() if word not in stop_words]
    for possible in possible_tokens:
        if len(possible) > 0:
            true_tokens.append(possible)
    return true_tokens

def get_feature_words(training_set, size = 1000):#build the vocabulary to count word frequencies that are crucial for building vectors used to calculate probabilities
    feature_words_dict = {}
    for element in training_set:
        for word in element[0]:
            if word in feature_words_dict:
                feature_words_dict[word] += 1
            else:
                feature_words_dict[word] = 1
    feature_words_tuple = sorted(feature_words_dict.items(), key=lambda x:x[1], reverse=True)
    feature_words = list(list(zip(*feature_words_tuple))[0])
    return feature_words[:size] if len(feature_words) > size else feature_words#only extract the most frequent words

def get_category_list(training_set):
    local_category_list = set()
    for row in training_set:
        local_category_list.add(row[1])
    category_list = list(local_category_list)
    category_list.sort()
    return category_list

def get_probability(training_set, feature_words):# to obtain the prior probabilities as well as the class probabiliy matrix
    class_list = get_category_list(train_data)
    prob_matrix = pd.DataFrame(index=feature_words, columns=class_list)
    word_dict = {}
    num_of_all_news = 0
    prob_classes = {}
    features_nums = len(feature_words)
    for cls in class_list:
        prob_classes[cls] = 0
        word_dict[cls] = []
    
    for element in training_set:
        num_of_all_news += 1
        prob_classes[element[1]] += 1
        for word in element[0]:
            word_dict[element[1]].append(word)
    
    for cls in class_list:
        prob_count = {}
        news_nums = prob_classes[cls]
        for word in feature_words:
            prob_count[word] = 1
        
        word_list = word_dict[cls]
        for word in prob_count.keys():
            if word in word_list:
                prob_count[word] += word_list.count(word)
        for word in prob_count.keys():
            prob_matrix.loc[word, cls] = prob_count[word] / (news_nums * features_nums + features_nums)
    
    for cls in prob_classes.keys():
        prob_classes[cls] = prob_classes[cls] / num_of_all_news
    return prob_matrix, prob_classes#the matrix of word probabilities for each class, and the dictionary of prior probabilities of each class 

def predict_with_content(prob_matrix, prob_classes, feature_words, content):
    result = {}
    for cls in prob_classes.keys():
        result[cls] = np.log(prob_classes[cls])
    for cls in result.keys():
        for word in feature_words:
            if word in content:
                result[cls] += np.log(prob_matrix.loc[word, cls] * content.count(word))
            else:
                result[cls] += np.log(1 - prob_matrix.loc[word, cls])#when there are new words not in the feature word dic, it means this news is likely not to belong to the class, so the probability should be subtracted
    return max(result, key = result.get)

def score(testing_set, prob_matrix, prob_classes, feature_words):
    y_true = []
    y_predict = []
    for element in testing_set:
        y_true.append(element[1])
        y_predict.append(predict_with_content(prob_matrix, prob_classes, feature_words, element[0]))
    return accuracy_score(y_true, y_predict)

if __name__ == "__main__":

    print("Multinomial NaiveBayes Start!")

    start_time = time.time()
    train_data = get_csv_data(TRAINING_NEWS_FILE)
    test_data = get_csv_data(TEST_FILE)
    feature_words = get_feature_words(train_data, size=1000)
    print("Loading Time: %ss" % str(time.time()-start_time))

    start_time = time.time()
    prob_matrix, prob_classes = get_probability(train_data, feature_words)
    print("Traning Time: %ss" % str(time.time()-start_time))
    
    start_time = time.time()
    acc = score(test_data, prob_matrix, prob_classes, feature_words)
    print("Testing Time: %ss" % str(time.time()-start_time))

    print("Accuracy Rate:%s" % acc)

    print("Multinomial NaiveBayes End!")


Multinomial NaiveBayes Start!
Loading Time: 0.6359918117523193s
Traning Time: 2.945294141769409s
Testing Time: 59.50153374671936s
Accuracy Rate:0.0
Multinomial NaiveBayes End!
