## Import some useful libraries

In [8]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score # for evaluating results
from scipy.sparse import coo_matrix # for sparse matrix

# Data directory and the names of the feature/label files inside it.
# `path` keeps its trailing slash because read_data builds each file location
# by plain string concatenation (path + file name).
path = 'ex6DataPrepared/'
train_data_path = 'train-features.txt'
test_data_path = 'test-features.txt'
train_label_path = 'train-labels.txt'
test_label_path = 'test-labels.txt'

## Extract data

In [9]:
# Number of columns of the sparse feature matrix built by read_data.
# NOTE(review): presumably the vocabulary size of the data set -- confirm against the data files.
nwords = 2500

def read_data(data_path, label_path, base_dir=None, n_words=None):
    """Load one labelled data set as a sparse matrix plus a list of labels.

    The label file holds one integer per line. The feature file holds one
    whitespace-separated integer triple per line, ``row col value``, where
    ``row`` and ``col`` are 1-based; ``value`` is stored at
    ``(row - 1, col - 1)`` of the returned matrix. Blank lines in either
    file are ignored, and fields beyond the third on a feature line are
    ignored as well.

    Parameters
    ----------
    data_path : str
        Feature file name, appended to ``base_dir``.
    label_path : str
        Label file name, appended to ``base_dir``.
    base_dir : str, optional
        Prefix concatenated in front of both file names, so it must carry
        its own trailing separator. Defaults to the module-level ``path``.
    n_words : int, optional
        Number of columns of the returned matrix. Defaults to the
        module-level ``nwords``.

    Returns
    -------
    (data, label) : tuple
        ``data`` is a ``scipy.sparse.coo_matrix`` of shape
        ``(len(label), n_words)``; ``label`` is a list of ints.

    Raises
    ------
    ValueError
        If a non-blank feature line has fewer than three fields, or if any
        field or label is not an integer.
    """
    # Fall back to the notebook-level configuration only when the caller
    # did not override it, so existing two-argument calls keep working.
    if base_dir is None:
        base_dir = path
    if n_words is None:
        n_words = nwords

    ## read label
    with open(base_dir + label_path) as f:
        # int() tolerates surrounding whitespace; blank lines are skipped
        label = [int(line) for line in f if line.strip()]

    ## read data
    triples = []
    with open(base_dir + data_path) as f:
        for line_no, line in enumerate(f, start=1):
            # split() with no argument copes with tabs and repeated spaces
            fields = line.split()
            if not fields:
                continue
            if len(fields) < 3:
                raise ValueError(
                    '%s, line %d: expected 3 fields, got %d'
                    % (data_path, line_no, len(fields))
                )
            triples.append([int(fields[0]), int(fields[1]), int(fields[2])])

    # reshape keeps the (0, 3) shape when the feature file has no entries
    dat = np.array(triples, dtype=int).reshape(-1, 3)

    # remember to -1 at coordinate since the file indices are 1-based
    # check this: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html
    # for more information about coo_matrix function
    data = coo_matrix(
        (dat[:, 2], (dat[:, 0] - 1, dat[:, 1] - 1)),
        shape=(len(label), n_words),
    )
    return (data, label)

In [10]:
# Load both splits; each call yields a sparse feature matrix and its label list.
train_data, train_label = read_data(train_data_path, train_label_path)
test_data, test_label = read_data(test_data_path, test_label_path)

## Multinomial Naive Bayes

In [25]:
# Train a multinomial Naive Bayes classifier on the training matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
multiNB_clf = MultinomialNB().fit(train_data, train_label)

# Score the held-out test set and report the accuracy as a percentage.
y_pred = multiNB_clf.predict(test_data)
multiNB_accuracy = 100 * accuracy_score(test_label, y_pred)
print('MultinomialNB test accuracy: %.2f%%.' % multiNB_accuracy)

MultinomialNB test accuracy: 98.08%.


## Bernoulli Naive Bayes

In [36]:
# Train a Bernoulli Naive Bayes classifier on the same training matrix.
# binarize=0.5 is the threshold scikit-learn uses to map each feature to a
# boolean before fitting/predicting: values above it become 1, the rest 0.
binNB_clf = BernoulliNB(binarize=0.5).fit(train_data, train_label)

# Score the held-out test set and report the accuracy as a percentage.
y_pred = binNB_clf.predict(test_data)
binNB_accuracy = 100 * accuracy_score(test_label, y_pred)
print('BernoulliNB test accuracy: %.2f%%.' % binNB_accuracy)

BernoulliNB test accuracy: 85.38%.
