# SpamDataset_Gaussian_Naive_Bayes_Classifier
Author: Xin Zhengfang

## Data Preprocessing

In [1]:
import scipy.io as sio
import numpy as np

In [2]:
# Load .mat file into numpy array
mat_contents = sio.loadmat('spamData.mat')
mat_contents.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Xtrain', 'Xtest', 'ytrain', 'ytest'])

In [3]:
#  Convert to arrary
Xtrain = mat_contents['Xtrain']
Xtest = mat_contents['Xtest']
ytrain = mat_contents['ytrain']
ytest = mat_contents['ytest']
print(Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape)

(3065, 57) (1536, 57) (3065, 1) (1536, 1)


In [4]:
# Data preprocessing

# Log-transform
log_Xtrain = np.log(Xtrain+1e-7)
log_Xtest = np.log(Xtest+1e-7)

## Gaussian Naive Bayes
**The class label:**
1. Because dataset has a lot of spam and non-spam emails, we don't need do some prior assumption.The maxmum likelihood $\lambda^{ML}$ can be used as the plug-in estimator for testing.

**The features distribution:**
1. To simplify the question, Maxmum likehood is used with univariate gaussian prior.

**ML estimation of $\mu$ ,$\sigma$ giving training data $D=\left\{x_{1}, \ldots, x_{N}\right\}$ $D=\left\{x_{1}, \ldots, x_{N}\right\}$:**
$$
\begin{aligned} \frac{\partial L}{\partial \mu} &=\frac{\partial}{\partial \mu}\left(\sum_{n=1}^{N}-\frac{\left(x_{n}-\mu\right)^{2}}{2 \sigma^{2}}\right)=\sum_{n=1}^{N} \frac{\left(x_{n}-\mu\right)}{\sigma^{2}}=0 \\ & \Longrightarrow \hat{\mu}=\frac{1}{N} \sum_{n=1}^{N} x_{n} \end{aligned}
$$
$$
\begin{aligned} \frac{\partial L}{\partial \sigma} &=\frac{\partial}{\partial \sigma}\left(\sum_{n=1}^{N}-\frac{\left(x_{n}-\mu\right)^{2}}{2 \sigma^{2}}-N \log \sigma\right)=\sum_{n} \frac{\left(x_{n}-\mu\right)^{2}}{\sigma^{3}}-\frac{N}{\sigma}=0 \\ & \Longrightarrow \hat{\sigma}^{2}=\frac{1}{N} \sum_{n=1}^{N}\left(x_{n}-\mu\right)^{2}=\frac{1}{N} \sum_{n=1}^{N}\left(x_{n}-\hat{\mu}\right)^{2} \end{aligned}
$$
Note: See the detailed derivations in Machine_Learning_AXIN_Probabilistic_Perspec(KM)-CHAPTER 4.1.3

In [5]:
# Trainning ★
'''
    To get lambda_ML, mu_jc_ML, sigma_jc_ML lists
'''
mu_jc_ML = [[],[]]
sigma2_jc_ML = [[],[]]
num_features = Xtrain.shape[-1]

# Pr(y = 1 | lambda_ML)
lambda_ML = np.sum(ytrain)/np.sum(np.ones(ytrain.shape))

c1_mask = ytrain.repeat(num_features,-1)
c0_mask = 1 - c1_mask
c1_log_train = log_Xtrain*c1_mask
c0_log_train = log_Xtrain*c0_mask

for j in range(num_features):
    # Pr(x_j0 | y = 0, mu_j0_ML, sigma_j0_ML) = N(xx_j0|mu.sigma^2)
    mu_jc_ML[0].append(np.sum(c0_log_train[:,j])/np.sum(1-ytrain))
    sigma2_jc_ML[0].append(np.power(np.sum(c0_log_train[:,j]-mu_jc_ML[0][j]),2)/np.sum(1-ytrain))
    # Pr(x_j1 | y = 1, mu_j1_ML, sigma_j1_ML) = N(x_j1|mu.sigma^2)
    mu_jc_ML[1].append(np.sum(c1_log_train[:,j])/np.sum(ytrain))
    sigma2_jc_ML[1].append(np.power(np.sum(c1_log_train[:,j]-mu_jc_ML[1][j]),2)/np.sum(ytrain))
mu_jc_ML = np.array(mu_jc_ML)
sigma2_jc_ML = np.array(sigma2_jc_ML)

In [6]:
# Predict

def UG_pred(log_features,lam,mu,sigma):
    '''
        Input: 
            log_zffeatures #log_features of 1 sample
            lam,mu_jc_ML,sigma_jc_ML #params of Univarate Gaussian model
        Output:
            pred #predicted label
    '''
    pr_c0 = np.prod(1/np.sqrt(2*np.pi*sigma2_jc_ML[0])*np.exp(-0.5*(log_features-mu_jc_ML[0])/sigma2_jc_ML[0]))
    pr_c1 = np.prod(1/np.sqrt(2*np.pi*sigma2_jc_ML[1])*np.exp(-0.5*(log_features-mu_jc_ML[1])/sigma2_jc_ML[1]))
    if pr_c0 > pr_c1:
        pred = 0
    else:
        pred = 1
    return pred

In [7]:
# Xtrain_pred 
Xtrain_pred = []
for spl in log_Xtrain:
    Xtrain_pred.append(UG_pred(spl,lambda_ML,mu_jc_ML,sigma2_jc_ML))

In [8]:
Xtrain_pred = np.array(Xtrain_pred).reshape(-1,1)

In [9]:
Xtrain_err = 1 - np.sum((Xtrain_pred == ytrain).astype('int'))/np.array(Xtrain_pred).shape[0]

In [10]:
# Xtest_pred
Xtest_pred = []
for spl in log_Xtest:
    Xtest_pred.append(UG_pred(spl,lambda_ML,mu_jc_ML,sigma2_jc_ML))

In [11]:
Xtest_pred = np.array(Xtest_pred).reshape(-1,1)

In [12]:
Xtest_err = 1 - np.sum((Xtest_pred == ytest).astype('int'))/np.array(Xtest_pred).shape[0]

## Training and testing error rates for the log-transformed data.

In [79]:
print("Training error rates: ",Xtrain_err)
print("Testing error rates: ",Xtest_err)

Training error rates:  0.4045676998368679
Testing error rates:  0.373046875


The result is not good. Maybe the Gaussian distribution doesn't fit spamdataset well.