In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the labelled e-mail corpus from the CSV next to the notebook and
# preview four randomly chosen rows.
df = pd.read_csv("./dataset/email.csv")
df.sample(n=4)

Unnamed: 0,text,spam
482,"Subject: debt information tue , 28 jun 2005 . ...",1
4669,Subject: informal interview with the enron res...,0
309,Subject: viagra is the # 1 med to struggle wit...,1
5543,Subject: weijun decided not to interview i gu...,0


In [3]:
# (rows, columns) of the frame exactly as loaded, before any cleaning.
df.shape

(5728, 2)

In [4]:
# Pre-processing: remove exact duplicate rows, then rows with missing values.
# drop_duplicates() returns a new frame, so its result has to be rebound;
# calling it without assignment leaves df untouched.
n_duplicates = int(df.duplicated().sum())  # duplicates present before cleaning
df = df.drop_duplicates(keep='first').dropna()
n_duplicates  # last expression, so the count is actually shown as cell output

In [5]:
# Feature frame: every column except the target label.
x = df.drop(columns=['spam'])

In [6]:
# Target frame: whatever is left once the feature columns are removed
# (kept as a DataFrame, not a Series).
y = df.drop(columns=x.columns)

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [11]:
from sklearn.naive_bayes import GaussianNB

In [9]:
# GaussianNB needs a dense numeric matrix, but x_train still holds the raw
# "text" column of e-mail strings, so fitting it directly fails. The e-mails
# are converted to token counts inside a pipeline, which lets
# model.predict(x_test) keep accepting the raw frame.
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

model = make_pipeline(
    # A scalar column name hands CountVectorizer the 1-D series of strings it
    # expects; sparse_threshold=0 forces dense output, which GaussianNB requires.
    # NOTE(review): a dense bag-of-words matrix is memory hungry; cap the
    # vocabulary with CountVectorizer(max_features=...) if this is a problem.
    make_column_transformer((CountVectorizer(), "text"), sparse_threshold=0),
    GaussianNB(),
)
# y_train is a single-column DataFrame; flatten it to a 1-D label vector.
model.fit(x_train, np.ravel(y_train))

In [None]:
# Predict labels for the held-out split produced by train_test_split.
y_pred = model.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [15]:
class Props:
    """Binary-classification metrics derived from a confusion matrix.

    The label ``1`` is the positive class; any other value counts as
    negative, for the true labels and the predictions alike.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels (list, NumPy array, pandas Series or a
        single-column DataFrame).
    y_pred : array-like
        Predicted labels, with the same number of elements as ``y_test``.

    Attributes
    ----------
    tp, fn, fp, tn : int
        Confusion-matrix counts. They are filled in by ``conf_matrix()``,
        which the constructor calls once.
    """

    def __init__(self, y_test, y_pred):
        self.y_test = y_test
        self.y_pred = y_pred
        # Every metric method reads self.tp / self.fn / self.fp / self.tn,
        # so the counts have to exist as soon as the object does.
        self.conf_matrix()

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator``, or 0 when the denominator is 0."""
        if denominator == 0:
            return 0
        return numerator / denominator

    def conf_matrix(self):
        """Count the outcomes, store them on the instance and return them.

        Returns
        -------
        tuple of int
            ``(tp, fn, fp, tn)``.

        Raises
        ------
        ValueError
            If ``y_test`` and ``y_pred`` hold a different number of labels.
        """
        # Flatten to positional arrays first: on a pandas object ``obj[i]``
        # is a label/column look-up, not "the i-th element".
        actual = np.asarray(self.y_test).ravel()
        predicted = np.asarray(self.y_pred).ravel()
        if actual.shape != predicted.shape:
            raise ValueError(
                "y_test and y_pred must contain the same number of labels "
                f"(got {actual.size} and {predicted.size})"
            )

        actual_pos = actual == 1
        predicted_pos = predicted == 1

        self.tp = int(np.count_nonzero(actual_pos & predicted_pos))
        self.fn = int(np.count_nonzero(actual_pos & ~predicted_pos))
        self.fp = int(np.count_nonzero(~actual_pos & predicted_pos))
        self.tn = int(np.count_nonzero(~actual_pos & ~predicted_pos))

        return self.tp, self.fn, self.fp, self.tn

    def accuracy(self):
        """Fraction of all samples classified correctly (0 for empty input)."""
        total = self.tp + self.tn + self.fn + self.fp
        return self._ratio(self.tp + self.tn, total)

    def precision(self):
        """tp / (tp + fp); 0 when nothing was predicted positive."""
        return self._ratio(self.tp, self.tp + self.fp)

    def recall(self):
        """tp / (tp + fn); 0 when there are no actual positives."""
        return self._ratio(self.tp, self.tp + self.fn)

    def sensitivity(self):
        """Alias for ``recall()``."""
        return self.recall()

    def f1(self):
        """Harmonic mean of precision and recall; 0 when either one is 0."""
        pre = self.precision()
        re = self.recall()

        if pre == 0 or re == 0:
            return 0

        return (2 * pre * re) / (pre + re)

    def specificity(self):
        """tn / (tn + fp); 0 when there are no actual negatives."""
        return self._ratio(self.tn, self.tn + self.fp)

    def negative_predictive_value(self):
        """tn / (tn + fn); 0 when nothing was predicted negative."""
        return self._ratio(self.tn, self.tn + self.fn)

    def mcc(self):
        """Matthews correlation coefficient.

        (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

        Returns 0 when any of the four marginal sums is zero.
        """
        # Plain Python ints keep the four-way product exact (no overflow).
        tp, fn, fp, tn = (int(v) for v in (self.tp, self.fn, self.fp, self.tn))
        product = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
        # Dividing by np.sqrt(0) yields nan/inf with a warning rather than
        # raising ZeroDivisionError, so the degenerate case is tested directly.
        if product == 0:
            return 0
        return (tp * tn - fp * fn) / np.sqrt(float(product))