### Problem 4

Implementation of Multinomial Model (Naive Bayes Document Classification)

In [2]:
'''
Created on Sep 29, 2019

This is the solution to Problem 4 of the assignment 1 of 
    DATA MINING course, CSC 503

@author: Jian Wang (V00935999)
'''
import pandas as pd
import numpy as np

'''------------------------------
#
# Preprocess 1: Read data from 2 pairs of text file (train_data.txt, test_data.txt)
#
---------------------------------'''
def read_data_from_files():
    traindata_file = open('traindata.txt')
    trainlabel_file = open('trainlabels.txt')
    testdata_file = open('testdata.txt')
    testlabel_file = open('testlabels.txt')
    
    train_data = []
    train_label = []
    test_data = []
    test_label = []
    
    for line in traindata_file:
        line = line.replace('\n', '')
        train_data.append(line)
        
    for line in trainlabel_file:
        line = line.replace('\n', '')
        train_label.append(int(line))
    
    for line in testdata_file:
        line = line.replace('\n', '')
        test_data.append(line)
    
    for line in testlabel_file:
        line = line.replace('\n', '')
        test_label.append(int(line))
        
    return (train_data, train_label, test_data, test_label)

'''------------------------------
#
# Preprocess 2: Extract train vocabulary
#
---------------------------------'''
def extract_vocab(train_data):
    
    vocabulary = []
    
    for line in train_data:
        line = line.replace('\n', '')
        line = line.split(' ')
        for word in line:
            if word not in vocabulary and len(word) > 0:
                vocabulary.append(word)
    vocabulary.sort()
    return vocabulary

'''------------------------------
#
# Preprocess 3: Convert data/label into DataFrame under column of vocabulary 
#
#        Convert array[1 dimensional] to dataframe[2 dimensional table]
#        Column: vocab, Row: doc_index, Cell: vocab count in a document
#
---------------------------------'''
def preprocess(vocabulary, train_data, train_label, test_data, test_label):
    
    train_x = np.zeros((len(train_data), len(vocabulary)))
    test_x = np.zeros((len(test_data), len(vocabulary)))
    
    train_count = 0
    
    for line in train_data:
        line = line.replace('\n', '')
        line = line.split(' ')
        for word in line:
            if word in vocabulary:
                index = vocabulary.index(word)
                train_x[train_count][index] += 1
                
        train_count += 1
        
    test_count = 0
    
    for line in test_data:
        line = line.replace('\n', '')
        line = line.split(' ')
        for word in line:
            if word in vocabulary:
                index = vocabulary.index(word)
                test_x[test_count][index] += 1
                
        test_count += 1
    
    train_y = list(map(int, train_label))
    test_y = list(map(int, test_label))
    
    train_x = pd.DataFrame(train_x, columns = vocabulary)
    test_x = pd.DataFrame(test_x, columns = vocabulary)
    
    train_y = pd.DataFrame(train_y, columns = ['label'])
    test_y =  pd.DataFrame(test_y, columns = ['label'])
    
    return (train_x, train_y, test_x, test_y)

'''------------------------------
#
# Main: Entry of classifier
#
---------------------------------'''
def NaiveBayesTextClassification():
    
    '''---------------------------------
        Part 1: Preprocess
    '''
    (train_docs, train_class, test_docs, test_class) = read_data_from_files()
    
    vocabulary = extract_vocab(train_docs)
    
    (train_DOCS, train_CLASS,
     test_DOCS, test_CLASS) = preprocess(vocabulary, 
                                     train_docs, train_class, 
                                     test_docs, test_class)
    
    '''---------------------------------
        Part 2: Train the classifier with train data
    '''
    (prior, condprob, classlist) = TrainMultinomialNB(train_CLASS, 
                                   train_DOCS, vocabulary)
    
    '''---------------------------------
        Part 3: Run with the test data 1
    '''
    result = ApplyMultinomialNB_OnDocs(classlist, vocabulary, prior, condprob, train_docs)
    accuracy = calculate_accuracy(result, train_class)
    print("Test on traindata.txt/trainlabels.txt, \n\t accuracy=", accuracy, "%\n")
    
    #
    #    Run with the test data 2
    #
    result = ApplyMultinomialNB_OnDocs(classlist, vocabulary, prior, condprob, test_docs)
    accuracy = calculate_accuracy(result, test_class)
    print("Test on testdata.txt/testlabels.txt, \n\t accuracy=", accuracy, "%")

#
# Main, utility function: calculate the accuracy 
#
def calculate_accuracy(result_label, test_label):
        rst = np.asarray(result_label)
        tst = np.asarray(test_label)

        count = np.equal(rst, tst)
        value, count = np.unique(count, return_counts = True)
        val_count = dict(zip(value, count))
        
        try :
            accuracy = 1 - (val_count[False] / rst.shape[0])
        except KeyError:
            accuracy = 1
        
        return round(accuracy*100, 2)


'''------------------------------
#
# Body 1: Train the classifier,
#
#        To refer to Pseudocode of the algorithm for more info about
#        the calculation of prior, condprob, which on Slide 14(Naive Bayes for Text Classification)
#        of Course Presentation
#
#        @CLASS_DF: Class in form of DataFrame ()
---------------------------------'''
def TrainMultinomialNB(CLASS_DF, DOCS, VOCAB):
    
    condprob = {}
    prior = {}
    countdocs = DOCS.shape[0]  #len(DOCS)
    
    clsval = CLASS_DF.columns.values
    clslist, clscount = np.unique(CLASS_DF[clsval], return_counts = True)
    count_DocInClass = dict(zip(clslist, clscount))
    for clsIdx, docCnt in count_DocInClass.items():
        prior[clsIdx] = count_DocInClass[clsIdx] / countdocs
        
    clscount = len(clscount)
    vocabcount = len(VOCAB)
    docscount = len(DOCS)
    
    #conditional probability
    condprob = np.zeros((clscount, vocabcount))
    
    #occurrence of a token in a term
    Token_CT = np.zeros((clscount, vocabcount))
    
    
    #Traverse all documents, Rows of Docs
    for i in range(docscount):
        cls_idx = CLASS_DF.iloc[i, 0]
        for k in range(vocabcount):
            # CountTokenOfTerm(text_class, token)
            Token_CT[cls_idx, k] += DOCS.iloc[i,k]
    
    sum_Token_CT = np.zeros(clscount)
    for cls in range(clscount) : 
        for k in range(vocabcount):
            sum_Token_CT[cls] += Token_CT[cls, k]

    for cls in range(clscount) : 
        for k in range(vocabcount):
            # get conditional probability
            condprob[cls, k] = ((Token_CT[cls,k] + 1)
                    / (sum_Token_CT[cls] + Token_CT[cls,k] + 1))
    
    return (prior, condprob, clslist)

'''------------------------------
#
# Body 2: Apply classification on a group of documents
#
---------------------------------'''
def ApplyMultinomialNB_OnDocs(classlist, VOCAB, prior, condprob, docs):
    classify_result = []
    
    for doc in docs:
        classify_result.append(
            ApplyMultinomialNB(classlist, VOCAB, prior, condprob, doc))
    
    return classify_result

#
# Body 2, utility function: Apply classifier on a document
#
#        To refer to Pseudocode of the algorithm for more info about
#        the calculation of score
#
def ApplyMultinomialNB(classlist, VOCAB, prior, condprob, doc):
    
    w_vocab = ExtractTokenFromDoc(VOCAB, doc)
    
    score = []
    for c in classlist:
        score.append(1 * np.log2(prior[c]))
        
        for t in w_vocab:
            # 
            idx = VOCAB.index(t)
            score[c] += 1 * np.log2(condprob[c, idx])
            
    return np.argmax(score)

# 
# Body 2, utility function: extract token from the test document
#
def ExtractTokenFromDoc(train_VOCAB, test_doc):
    w_vocab = []

    
    test_doc = test_doc.replace('\n', '')
    line = test_doc.split(' ')
    for word in line:
        if word in train_VOCAB and len(word) > 0 and word not in w_vocab:
            w_vocab.append(word)
            
    w_vocab.sort()
    
    return w_vocab

#def (documents):
#    vocab = preprocess_vocab(documents)
#    return vocab

if __name__ == '__main__':
    NaiveBayesTextClassification()



Test on traindata.txt/trainlabels.txt, 
	 accuracy= 96.58 %

Test on testdata.txt/testlabels.txt, 
	 accuracy= 80.2 %
