# Implementing Multinomial Naive Bayes

In [1]:
#importing all the necessary modules
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [2]:
#importing the training and testing data
training_data = np.loadtxt("preprocessed_x_y_train.csv", delimiter=",")
testing_data = np.loadtxt("preprocessed_x_y_test.csv", delimiter=",")

##### defining the fit function of our multinomial naive bayes

In [3]:
#defining the fit function of our multinomial naive bayes

def fit(x_train, y_train):
    dictionary = {}
    classes = set(y_train)
    
    for current_class in classes:
        dictionary[current_class] = {}
        dictionary["total_data"] = len(y_train)
        
        current_class_rows = (y_train == current_class)
        x_current_class_rows = x_train[current_class_rows]
        y_current_class_rows = y_train[current_class_rows]
        
        for j in range(1, x_train.shape[1]+1):
            dictionary[current_class][j] = x_current_class_rows[:, j-1].sum()
        
        dictionary[current_class]["total_count"] = x_train.shape[1]
        dictionary[current_class]["total_files"] = len(y_current_class_rows)
        
    return dictionary    
        
    

### takes a single row and calculates the probability nedded for multinomial naive bayes
#### p( y=c1 | d=dtest ) = p( d=dtest | y=c1 ) * p( y=c1 )
### We calculate the log probability as multiplication of all the probability of the features will result in enormous number, which will not be efficient

In [4]:

def probability(dictionary, x, current_class):
    output = 0
    num_features = len(dictionary[current_class].keys()) - 1;
    for i, j in zip(range(num_features),x):
       
        #checks where the frequency of the word in x_test is NIL (as the x_test is made according to the 
        #words available in vocabulary and 0 is the frequency when the word is not in x_test)  
        
        if x[i] != 0.0: 

            count_current_class_with_value_xj = dictionary[current_class][i+1]
            count_total_words_in_current_class = dictionary[current_class]['total_count']
            no_of_words_in_vocabulary = len(x)
            numerator = count_current_class_with_value_xj + 1
            denominator = count_total_words_in_current_class + no_of_words_in_vocabulary
            current_xj_probability = np.log(numerator) - np.log(denominator) #log probability
            output = output + current_xj_probability
            ouput = output + np.log(dictionary[current_class]["total_files"]/dictionary["total_data"]) #explained above
    return output


## takes the dictionary and a single row of x_test and returns the best class for that row

In [5]:
def predictSinglePoint(dictionary, x):
    classes = dictionary.keys()
    best_p = -1000
    best_class = -1
    first_run = True
    for current_class in classes:
        if current_class == 'total_data':
            continue
        p_current_class = probability(dictionary, x, current_class)
        if (first_run or (p_current_class > best_p)):
            
            best_p = p_current_class
            best_class = current_class
        first_run = False
    return best_class

## predicts the best class of entire x_test

In [6]:
def predict(dictionary, x_test):
    y_pred = []
    for x in x_test:
        x_class = predictSinglePoint(dictionary, x)
        y_pred.append(x_class)
        
    return y_pred

In [7]:
x_train = training_data[:,:-1]
y_train = training_data[:,-1]
x_test = testing_data[:,:-1]
y_test = testing_data[:,-1]

In [8]:
dic = fit(x_train=x_train, y_train=y_train)

In [None]:
y_pred = predict(dictionary=dic, x_test=x_test)

In [10]:
y_pred

[1.0,
 12.0,
 18.0,
 12.0,
 12.0,
 12.0,
 12.0,
 12.0,
 11.0,
 12.0,
 12.0,
 18.0,
 1.0,
 17.0,
 12.0,
 12.0,
 18.0,
 3.0,
 12.0,
 1.0,
 5.0,
 12.0,
 12.0,
 12.0,
 12.0,
 12.0,
 12.0,
 12.0,
 12.0,
 5.0,
 1.0,
 18.0,
 18.0,
 5.0,
 12.0,
 5.0,
 12.0,
 17.0,
 12.0,
 12.0,
 16.0,
 18.0,
 12.0,
 12.0,
 1.0,
 12.0,
 12.0,
 17.0,
 12.0,
 12.0,
 12.0,
 12.0,
 11.0,
 5.0,
 12.0,
 12.0,
 16.0,
 12.0,
 12.0,
 12.0,
 18.0,
 12.0,
 5.0,
 5.0,
 5.0,
 18.0,
 12.0,
 12.0,
 12.0,
 14.0,
 16.0,
 5.0,
 1.0,
 11.0,
 17.0,
 17.0,
 12.0,
 17.0,
 1.0,
 12.0,
 12.0,
 12.0,
 12.0,
 12.0,
 5.0,
 12.0,
 12.0,
 17.0,
 12.0,
 11.0,
 12.0,
 5.0,
 11.0,
 12.0,
 10.0,
 16.0,
 17.0,
 12.0,
 14.0,
 17.0,
 12.0,
 12.0,
 11.0,
 12.0,
 11.0,
 12.0,
 6.0,
 12.0,
 12.0,
 11.0,
 12.0,
 12.0,
 5.0,
 5.0,
 11.0,
 12.0,
 12.0,
 12.0,
 5.0,
 3.0,
 12.0,
 12.0,
 1.0,
 16.0,
 12.0,
 12.0,
 12.0,
 5.0,
 12.0,
 12.0,
 12.0,
 5.0,
 12.0,
 1.0,
 12.0,
 12.0,
 11.0,
 12.0,
 12.0,
 5.0,
 17.0,
 12.0,
 12.0,
 18.0,
 12.0,
 12.0,
 12.0,


In [11]:
print(classification_report(y_true=y_test, y_pred=y_pred))

             precision    recall  f1-score   support

        0.0       0.70      0.78      0.74       300
        1.0       0.62      0.84      0.71       300
        2.0       1.00      0.49      0.65       300
        3.0       0.86      0.76      0.81       300
        4.0       0.97      0.63      0.76       300
        5.0       0.55      0.92      0.69       300
        6.0       0.90      0.68      0.78       300
        7.0       0.94      0.65      0.77       300
        8.0       0.99      0.29      0.45       300
        9.0       0.98      0.40      0.57       300
       10.0       0.73      0.96      0.83       300
       11.0       0.73      0.90      0.80       300
       12.0       0.98      0.54      0.70       300
       13.0       0.95      0.67      0.79       300
       14.0       0.86      0.83      0.84       300
       15.0       0.89      0.97      0.93       297
       16.0       0.76      0.74      0.75       300
       17.0       0.41      0.97      0.57   

In [12]:
print(accuracy_score(y_test, y_pred))

0.6991829247957312


In [13]:
print(confusion_matrix(y_test, y_pred))

[[233   0   0   0   0   2   0   0   0   0   0   1   0   1   0   7   0  29
    6  21]
 [  2 251   0   2   0  14   0   0   0   1   1   6   0   3   1   0   0  10
    7   2]
 [  0  34 146   8   2  90   1   0   0   0   0   6   0   0   1   0   0   4
    6   2]
 [  0  23   0 229   2  22   2   0   0   0   0   5   0   0   1   0   0   5
   10   1]
 [  1  29   0   8 189  22   1   0   0   0   1  19   0   1   2   0   2  13
   12   0]
 [  1   9   0   2   0 276   0   0   0   0   0   1   0   1   2   0   0   2
    6   0]
 [  2  17   0  11   2  12 204   2   0   0   3   4   1   0   5   0   1  21
   15   0]
 [  1   3   0   0   0   0   9 195   0   0   1   1   1   1   3   0   3  44
   38   0]
 [  7   6   0   0   0  11   6  11  87   0   6  11   1   2   6   0  17  56
   71   2]
 [  1   4   0   1   0   9   2   0   1 119  86   4   0   0   3   0   1  35
   33   1]
 [  0   1   0   0   0   1   0   0   0   0 287   1   0   0   0   0   0   2
    8   0]
 [  0   0   0   0   0   4   0   0   0   0   0 270   0   0   2   0