In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts
from scipy.stats import norm

In [2]:
df_x = pd.read_csv('spambase.data', header=None, engine='python')
df_x = df_x.sample(frac=1, random_state=0)
df_y = pd.DataFrame(data=df_x[df_x.columns[-1]])
spam_x, nspam_x = [x for _, x in df_x.groupby(df_x[df_x.columns[-1]] == 0)]
spam_y = pd.DataFrame(data=spam_x[spam_x.columns[-1]])
nspam_y = pd.DataFrame(data=nspam_x[nspam_x.columns[-1]])
df_x.drop(df_x.columns[[-1,]], axis=1, inplace=True)
spam_x.drop(spam_x.columns[[-1,]], axis=1, inplace=True)
nspam_x.drop(nspam_x.columns[[-1,]], axis=1, inplace=True)

In [3]:
df_x_train, df_x_test, df_y_train, df_y_test = tts(df_x, df_y, test_size=0.33)
spam_x_train, spam_x_test, spam_y_train, spam_y_test = tts(spam_x, spam_y, test_size=0.33)
nspam_x_train, nspam_x_test, nspam_y_train, nspam_y_test = tts(nspam_x, nspam_y, test_size=0.33)

In [4]:
spam_prior = spam_x.shape[0]/df_x.shape[0]
nspam_prior = nspam_x.shape[0]/df_x.shape[0]
mean = df_x_train.mean(axis=0)
std = df_x_train.std(axis=0)
spam_x_train = spam_x_train.subtract(mean).divide(std)
nspam_x_train = nspam_x_train.subtract(mean).divide(std)
spam_x_train_mean = spam_x_train.mean(axis=0)
nspam_x_train_mean = nspam_x_train.mean(axis=0)
spam_x_train_std = spam_x_train.std(axis=0)
nspam_x_train_std = nspam_x_train.std(axis=0)
df_x_s_test = df_x_test.subtract(mean).divide(std)

In [5]:
spam_norm = norm.pdf(df_x_s_test, spam_x_train_mean, spam_x_train_std)
spam_norm = np.add(spam_norm, np.finfo(float).eps)
nspam_norm = norm.pdf(df_x_s_test, nspam_x_train_mean, nspam_x_train_std)
nspam_norm = np.add(nspam_norm, np.finfo(float).eps)
spam_p = np.prod(spam_norm, axis=1) * spam_prior
nspam_p = np.prod(nspam_norm, axis=1) * nspam_prior

In [6]:
y_test_p = [1 if spam_p[i] > nspam_p[i] else 0 for i in range(spam_p.shape[0])]
tp, tn, fp, fn = 0, 0, 0, 0
for i in range(df_y_test.shape[0]):
    if y_test_p[i] == df_y_test.values[i]:
        if y_test_p[i] == 1:
            tp += 1
        else:
            tn += 1
    else:
        if y_test_p[i] == 1:
            fp += 1
        else:
            fn += 1
accuracy = (tp + tn)/df_y_test.shape[0]
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
print(tp, tn, fp, fn)
print(accuracy, precision, recall, f1)

604 550 351 14
0.7597103357472021 0.6324607329842932 0.9773462783171522 0.7679593134138588
