In [18]:
# importing libraries
import pandas as pd
import numpy as np
from collections import Counter

In [19]:
# reading the features of the training dataset (space-separated values, one observation per row, no header)
x_train = pd.read_csv('example_data/train-features.txt', sep = " ", header = None).to_numpy()
# reading the labels of the training dataset
# the file holds one label per line, so the default parser already yields a single column;
# sep = '\n' is avoided because recent pandas releases reject a line terminator as separator
y_train = pd.read_csv('example_data/train-labels.txt', header = None)[0].values.tolist()
# reading the features of the test dataset
x_test = pd.read_csv('example_data/test-features.txt', sep = " ", header = None).to_numpy()
# reading the labels of the test dataset (same one-label-per-line layout as the training labels)
y_test = pd.read_csv('example_data/test-labels.txt', header = None)[0].values.tolist()

# checking dimensions of the dataset
print("dimensions of the training features dataset:", x_train.shape)
print("no. of emails in the training features = no. of emails in the training labels?", len(y_train) == x_train.shape[0])
print("dimensions of the test features dataset:", x_test.shape)
print("no. of emails in the test features = no. of emails in the test labels?", len(y_test) == x_test.shape[0])

# checking the number of classes in each dataset and whether they are imbalanced
print("classes in the training dataset and their counts:", dict(Counter(y_train)))
# this line reports the test labels, so the message must say "test" rather than "training"
print("classes in the test dataset and their counts:", dict(Counter(y_test)))

dimensions of the training features dataset: (700, 2500)
no. of emails in the training features = no. of emails in the training labels? True
dimensions of the test features dataset: (260, 2500)
no. of emails in the test features = no. of emails in the test labels? True
classes in the training dataset and their counts: {0: 350, 1: 350}
classes in the training dataset and their counts: {0: 130, 1: 130}


In [20]:
# learning the weights of the features
def calc_cond_probs(x_train, dirichlet_alpha, y_train=None):
    """Estimate smoothed per-feature conditional probabilities for class 0 and class 1.

    For each class the probability of feature ``j`` is

        (count of feature j in the class + dirichlet_alpha)
        / (total count over all features in the class + dirichlet_alpha * n_features)

    Parameters
    ----------
    x_train : 2-D array-like, shape (n_observations, n_features)
        Feature matrix; rows are observations and columns are features.
    dirichlet_alpha : float
        Additive smoothing constant applied to every feature count.
    y_train : array-like of length n_observations, optional
        Class labels (0 or 1) used to select the rows of each class. When
        omitted, the legacy behaviour is kept: the first 350 rows are treated
        as class 0 and all remaining rows as class 1, which is only correct
        if the data is sorted that way.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features_y0, features_y1)``, each of length n_features.

    Raises
    ------
    ValueError
        If ``y_train`` is given and its length differs from the number of rows.
    """
    x_train = np.asarray(x_train)
    n_features = x_train.shape[1]

    if y_train is None:
        # legacy split: assumes rows are ordered by class with exactly 350 class-0 rows first
        rows_y0 = x_train[:350, :]
        rows_y1 = x_train[350:, :]
    else:
        labels = np.asarray(y_train)
        if labels.shape[0] != x_train.shape[0]:
            raise ValueError("y_train must have one label per row of x_train")
        rows_y0 = x_train[labels == 0]
        rows_y1 = x_train[labels == 1]

    def _smoothed_probs(rows):
        # per-feature totals for the class; their sum is the class-wide total count
        feature_counts = rows.sum(axis=0)
        return (feature_counts + dirichlet_alpha) / (feature_counts.sum() + dirichlet_alpha * n_features)

    return _smoothed_probs(rows_y0), _smoothed_probs(rows_y1)

# the smoothing hyperparameter dirichlet_alpha is fixed at 1 for now;
# it has not been tuned yet
w_y0, w_y1 = calc_cond_probs(x_train, dirichlet_alpha=1)

# preview the first five learned weights of each class
for _header, _weights in (
    ("feature weights for class 0:", w_y0),
    ("feature weights for class 1:", w_y1),
):
    print(_header, _weights[:5])

[0.00501152 0.00393762 0.00191434 0.01760256 0.00091826]
[0.01468118 0.01015245 0.01233177 0.00018072 0.0112049 ]


In [21]:
def calc_log_likelihood(f_y0, f_y1, x):
    """Compute the per-observation log likelihood under each class.

    For every row of ``x`` the result is ``sum_j x[i, j] * log(f[j])``,
    evaluated once with the class-0 weights and once with the class-1 weights.

    Parameters
    ----------
    f_y0, f_y1 : array-like of length n_features
        Per-feature conditional probabilities for class 0 and class 1.
    x : 2-D array-like, shape (n_observations, n_features)
        Feature matrix to score.

    Returns
    -------
    tuple of list
        ``(log_likelihood_y0, log_likelihood_y1)``, one float per row of ``x``.
    """
    x = np.asarray(x)

    # the logs do not depend on the row, so compute them once instead of once per observation
    log_f_y0 = np.log(np.asarray(f_y0, dtype=float))
    log_f_y1 = np.log(np.asarray(f_y1, dtype=float))

    # a matrix-vector product evaluates the weighted sum for all rows at once
    log_likelihood_y0 = (x @ log_f_y0).tolist()
    log_likelihood_y1 = (x @ log_f_y1).tolist()

    return log_likelihood_y0, log_likelihood_y1

# calculating log likelihoods for the training dataset
log_w_y0, log_w_y1 = calc_log_likelihood(w_y0, w_y1, x_train)

# preview the first five observations of each class
for _header, _values in (
    ("log likelihoods of observations for class 0:", log_w_y0),
    ("log likelihoods of observations for class 1:", log_w_y1),
):
    print(_header, _values[:5])

[-145.7770372776866, -137.42774744458674, -975.8265938223943, -435.5389319660713, -271.8035324777869]
[-168.1442994983499, -183.7289678737881, -1163.7451814682588, -521.9402014440582, -314.335999938414]


In [23]:
def calc_posterior(logLH_y0, logLH_y1, y):
    """Add the log class priors to the per-observation log likelihoods.

    The returned values are unnormalized log posteriors (log likelihood plus
    log prior); they are not normalized into probabilities.

    Parameters
    ----------
    logLH_y0, logLH_y1 : array-like of float
        Per-observation log likelihoods under class 0 and class 1.
    y : array-like
        Class labels (0 or 1) from which the class priors are estimated as
        relative frequencies. Lists and numpy arrays are both accepted.

    Returns
    -------
    tuple of numpy.ndarray
        ``(posterior_y0, posterior_y1)``, one value per observation.

    Raises
    ------
    ValueError
        If ``y`` is empty, since no prior can be estimated.
    """
    labels = np.asarray(y)
    n_labels = len(labels)
    if n_labels == 0:
        raise ValueError("y must contain at least one label to estimate class priors")

    # calculating log priors for classes as relative label frequencies
    log_prior_y0 = np.log(np.count_nonzero(labels == 0) / n_labels)
    log_prior_y1 = np.log(np.count_nonzero(labels == 1) / n_labels)

    # explicit array conversion instead of relying on list + numpy-scalar coercion
    posterior_y0 = np.asarray(logLH_y0, dtype=float) + log_prior_y0
    posterior_y1 = np.asarray(logLH_y1, dtype=float) + log_prior_y1

    return posterior_y0, posterior_y1

# calculating posteriors for the training dataset
posterior_y0, posterior_y1 = calc_posterior(log_w_y0, log_w_y1, y_train)

# preview the first five observations of each class
for _header, _values in (
    ("posterior probabilities of observation for class 0:", posterior_y0),
    ("posterior probabilities of observation for class 1:", posterior_y1),
):
    print(_header, _values[:5])

[-146.47018446 -138.12089463 -976.519741   -436.23207915 -272.49667966]
[ -168.83744668  -184.42211505 -1164.43832865  -522.63334862
  -315.02914712]


In [29]:
def classify(p_y0, p_y1, y):
    """Predict a class per observation and evaluate the predictions against ``y``.

    An observation is assigned class 0 when its class-0 score is greater than
    or equal to its class-1 score (ties go to class 0), otherwise class 1.

    Parameters
    ----------
    p_y0, p_y1 : array-like of float
        Per-observation scores for class 0 and class 1.
    y : array-like
        True class labels (0 or 1), one per observation.

    Returns
    -------
    class_pred : list of int
        Predicted label for every observation.
    accuracy : float
        Fraction of observations whose prediction equals the true label.
    confusion_matrix : numpy.ndarray, shape (2, 2)
        Rows are the actual class and columns the predicted class:
        ``[[actual 0 / predicted 0, actual 0 / predicted 1],
           [actual 1 / predicted 0, actual 1 / predicted 1]]``.

    Raises
    ------
    ValueError
        If the inputs differ in length or ``y`` is empty.
    """
    scores_y0 = np.asarray(p_y0)
    scores_y1 = np.asarray(p_y1)
    labels = np.asarray(y)

    n_obs = len(labels)
    if not (len(scores_y0) == len(scores_y1) == n_obs):
        raise ValueError("p_y0, p_y1 and y must all have the same length")
    if n_obs == 0:
        raise ValueError("y must contain at least one label to compute accuracy")

    # predicted class labels; ties are resolved in favour of class 0
    predicted = np.where(scores_y0 >= scores_y1, 0, 1)
    class_pred = predicted.tolist()

    actual_0 = labels == 0
    actual_1 = labels == 1
    predicted_0 = predicted == 0
    predicted_1 = predicted == 1

    # cell counts are named by (actual, predicted) so the row/column meaning stays unambiguous
    actual0_pred0 = int(np.count_nonzero(actual_0 & predicted_0))
    actual0_pred1 = int(np.count_nonzero(actual_0 & predicted_1))
    actual1_pred0 = int(np.count_nonzero(actual_1 & predicted_0))
    actual1_pred1 = int(np.count_nonzero(actual_1 & predicted_1))

    # constructing a confusion matrix (rows: actual class, columns: predicted class)
    confusion_matrix = np.array([[actual0_pred0, actual0_pred1],
                                 [actual1_pred0, actual1_pred1]])
    # calculating accuracy of the model from the diagonal of the confusion matrix
    accuracy = (actual0_pred0 + actual1_pred1) / n_obs

    return class_pred, accuracy, confusion_matrix

# predict labels for the training dataset and measure how well they match the true labels
y_predicted, model_accuracy, model_confusion = classify(posterior_y0, posterior_y1, y_train)

# report the accuracy as a percentage, followed by the confusion matrix
print("Model accuracy is {:0.2f}%".format(100 * model_accuracy))
print("Training confusion matrix:", model_confusion, sep="\n")

Model accuracy is 98.71%
confusion matrix:
[[349   1]
 [  8 342]]


In [31]:
# end-to-end multinomial naive Bayes: fit on the training set, then score the training and test sets
def NB_classification(x_train, x_test, y_train, y_test, dirichlet_alpha):
    """Train the classifier and report its accuracy on the training and test data.

    Chains calc_cond_probs, calc_log_likelihood, calc_posterior and classify.
    Both accuracies are printed as percentages.

    Parameters
    ----------
    x_train, x_test : feature matrices for the training and test observations
    y_train, y_test : true labels for the training and test observations
    dirichlet_alpha : smoothing constant forwarded to calc_cond_probs

    Returns
    -------
    tuple
        ``(test_class_pred, test_confusion)`` as produced by classify for the
        test data.
    """
    # fit: smoothed per-feature conditional probabilities for both classes
    cond_y0, cond_y1 = calc_cond_probs(x_train, dirichlet_alpha)

    def _score(features, labels):
        # class priors always come from the training labels, also when scoring the test set
        ll_y0, ll_y1 = calc_log_likelihood(cond_y0, cond_y1, features)
        post_y0, post_y1 = calc_posterior(ll_y0, ll_y1, y_train)
        return classify(post_y0, post_y1, labels)

    # training set first, then test set; only the accuracy is needed from the training pass
    _, training_accuracy, _ = _score(x_train, y_train)
    test_class_pred, test_accuracy, test_confusion = _score(x_test, y_test)

    # printing out prediction accuracy on training and test datasets
    print("Training classification accuracy: {:0.2f}%".format(training_accuracy*100))
    print("Test classification accuracy: {:0.2f}%".format(test_accuracy*100))

    return test_class_pred, test_confusion

# run the full pipeline with the smoothing constant fixed at 1, then show the test confusion matrix
y_test_pred, test_confusion_matrix = NB_classification(
    x_train, x_test, y_train, y_test, dirichlet_alpha=1
)
print("Test confusion matrix:", test_confusion_matrix, sep="\n")

Training classification accuracy: 98.71%
Test classification accuracy: 97.31%
Test confusion matrix:
[[127   3]
 [  4 126]]
