Simple Naive Bayes classifier for CS 441 - Ian Vetter


In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from aml_utils import test_case_checker

In [None]:
Dataset: the Pima Indians Diabetes Database, available at
https://www.kaggle.com/uciml/pima-indians-diabetes-database/data


# Read the diabetes CSV (path is relative to the notebook's working directory)
# into a DataFrame that every later cell uses as `df`.
df = pd.read_csv('../BasicClassification-lib/diabetes.csv')
# Trailing expression of the cell: previews the first rows of the table.
df.head()

In [None]:
# Reproducible random split: draw one uniform number per row of `df` and cut
# at the 80th percentile of the draws. Rows below the cut go to training,
# rows at or above it go to evaluation.
np_random = np.random.RandomState(seed=12354)
rand_unifs = np_random.uniform(0, 1, size=len(df))
division_thresh = np.percentile(rand_unifs, 80)
eval_indicator = rand_unifs >= division_thresh
# The draws are never NaN, so the complement of the evaluation mask is exactly
# the set of rows strictly below the threshold.
train_indicator = ~eval_indicator


In [None]:
# Training split: keep the rows flagged by `train_indicator` and renumber them
# from zero.
train_df = df.loc[train_indicator].reset_index(drop=True)
# 'Outcome' is the label column; every other column is a feature.
train_labels = train_df['Outcome'].values
train_features = train_df.drop(columns='Outcome').values
# Trailing expression of the cell: previews the first training rows.
train_df.head()

In [None]:
# Evaluation split: keep the rows flagged by `eval_indicator` and renumber
# them from zero.
eval_df = df.loc[eval_indicator].reset_index(drop=True)
# 'Outcome' is the label column; every other column is a feature.
eval_labels = eval_df['Outcome'].values
eval_features = eval_df.drop(columns='Outcome').values
# Trailing expression of the cell: previews the first evaluation rows.
eval_df.head()

Preprocessing

In [None]:
# Work on deep copies so the original train/eval frames stay untouched.
train_df_with_nans = train_df.copy(deep=True)
eval_df_with_nans = eval_df.copy(deep=True)

# In the listed columns every 0 is replaced by NaN, first in the training
# copy and then in the evaluation copy.
for col_with_nans in ['BloodPressure', 'SkinThickness', 'BMI', 'Age']:
    for frame_with_nans in (train_df_with_nans, eval_df_with_nans):
        frame_with_nans[col_with_nans] = frame_with_nans[col_with_nans].replace(0, np.nan)

# Feature matrices are all columns except the 'Outcome' label.
train_features_with_nans = train_df_with_nans.drop(columns='Outcome').values
eval_features_with_nans = eval_df_with_nans.drop(columns='Outcome').values

Naive Bayes

$$\log p_y =\begin{bmatrix}\log p(y=0)\\\log p(y=1)\end{bmatrix}$$

In [None]:
def log_prior(train_labels):
    """Return the log class priors of a binary label vector.

    Args:
        train_labels: NumPy array of labels; entries equal to 0 and 1 are
            counted, and the total is the length of the first axis.

    Returns:
        Array of shape (2, 1): row 0 is log p(y=0), row 1 is log p(y=1),
        each being the log of the class's share of all labels.
    """
    total = train_labels.shape[0]
    # Count the members of each class, in the order (class 0, class 1).
    class_counts = [np.count_nonzero(train_labels == label) for label in (0, 1)]

    log_py = np.array([[np.log(count / total)] for count in class_counts])

    assert log_py.shape == (2, 1)
    return log_py

$$\mu_y = \begin{bmatrix} \mathbb{E}[x^{(0)}|y=0] & \mathbb{E}[x^{(0)}|y=1]\\
\mathbb{E}[x^{(1)}|y=0] & \mathbb{E}[x^{(1)}|y=1] \\
\cdots & \cdots\\
\mathbb{E}[x^{(7)}|y=0] & \mathbb{E}[x^{(7)}|y=1]\end{bmatrix}$$

In [None]:
def cc_mean_ignore_missing(train_features, train_labels):
    """Return per-class feature means, skipping missing (NaN) entries.

    Args:
        train_features: Array of shape (N, d), one row per sample. NaN marks
            a missing measurement and is left out of the average.
        train_labels: Array of N labels; rows labelled 0 and 1 are used.

    Returns:
        Array ``mu_y`` of shape (d, 2): column 0 holds the feature means of
        the y=0 rows, column 1 those of the y=1 rows. An entry is NaN (and
        NumPy emits a RuntimeWarning) when a class has no observed value for
        that feature.
    """
    _, d = train_features.shape

    # np.nanmean rather than np.mean: a single NaN would otherwise turn the
    # whole column mean into NaN, which contradicts "ignore missing".
    means_yzer = np.nanmean(train_features[train_labels == 0], axis=0)
    means_yone = np.nanmean(train_features[train_labels == 1], axis=0)

    # Stack as columns so that rows index features and columns index classes.
    mu_y = np.array([means_yzer, means_yone]).T

    assert mu_y.shape == (d, 2)
    return mu_y

$$\sigma_y = \begin{bmatrix} \text{std}[x^{(0)}|y=0] & \text{std}[x^{(0)}|y=1]\\
\text{std}[x^{(1)}|y=0] & \text{std}[x^{(1)}|y=1] \\
\cdots & \cdots\\
\text{std}[x^{(7)}|y=0] & \text{std}[x^{(7)}|y=1]\end{bmatrix}$$

In [None]:
def cc_std_ignore_missing(train_features, train_labels):
    """Return per-class feature standard deviations, skipping NaN entries.

    Args:
        train_features: Array of shape (N, d), one row per sample. NaN marks
            a missing measurement and is left out of the computation.
        train_labels: Array of N labels; rows labelled 0 and 1 are used.

    Returns:
        Array ``sigma_y`` of shape (d, 2): column 0 holds the population
        standard deviations (NumPy's default, ddof=0) of the y=0 rows,
        column 1 those of the y=1 rows. An entry is NaN when a class has no
        observed value for that feature.
    """
    _, d = train_features.shape

    # np.nanstd rather than np.std: a single NaN would otherwise turn the
    # whole column's deviation into NaN, which contradicts "ignore missing".
    std_yzer = np.nanstd(train_features[train_labels == 0], axis=0)
    std_yone = np.nanstd(train_features[train_labels == 1], axis=0)

    # Stack as columns so that rows index features and columns index classes.
    sigma_y = np.array([std_yzer, std_yone]).T

    assert sigma_y.shape == (d, 2)
    return sigma_y

$$\log p_{x,y} = \begin{bmatrix} \bigg[\log p(y=0) + \sum_{j=0}^{7} \log p(x_1^{(j)}|y=0) \bigg] & \bigg[\log p(y=1) + \sum_{j=0}^{7} \log p(x_1^{(j)}|y=1) \bigg] \\
\bigg[\log p(y=0) + \sum_{j=0}^{7} \log p(x_2^{(j)}|y=0) \bigg] & \bigg[\log p(y=1) + \sum_{j=0}^{7} \log p(x_2^{(j)}|y=1) \bigg] \\
\cdots & \cdots \\
\bigg[\log p(y=0) + \sum_{j=0}^{7} \log p(x_N^{(j)}|y=0) \bigg] & \bigg[\log p(y=1) + \sum_{j=0}^{7} \log p(x_N^{(j)}|y=1) \bigg] \\
\end{bmatrix}$$

In [None]:
def log_prob(train_features, mu_y, sigma_y, log_py):
    """Return the joint log-probability log p(x, y) of each sample and class.

    Every feature is modelled as an independent Gaussian given the class, so
    for sample i and class c the result is

        log p(y=c) + sum_j log N(x_i[j]; mu_y[j, c], sigma_y[j, c]).

    Args:
        train_features: Array of shape (N, d), one row per sample.
        mu_y: Array of shape (d, 2) with the per-class feature means.
        sigma_y: Array of shape (d, 2) with the per-class feature standard
            deviations.
        log_py: Log class priors holding two values (e.g. shape (2, 1)),
            ordered class 0 then class 1.

    Returns:
        Array of shape (N, 2); column c holds log p(x_i, y=c). A NaN in a
        sample's features makes that sample's row NaN, because the feature
        terms are combined with a plain sum.
    """
    features = np.asarray(train_features)
    N, d = features.shape

    # Broadcast (N, d, 1) against (d, 2) to get every sample/feature/class
    # deviation at once instead of growing the result row by row.
    deviation = features[:, :, np.newaxis] - mu_y
    log_normaliser = -np.log(sigma_y * np.sqrt(2 * np.pi))
    log_density = log_normaliser - deviation ** 2 / (2 * sigma_y ** 2)

    # Sum over the feature axis, then add the class prior to each column.
    log_p_x_y = log_density.sum(axis=1) + np.asarray(log_py).reshape(1, 2)

    assert log_p_x_y.shape == (N, 2)
    return log_p_x_y

## **Classifier!**

In [None]:
class NBClassifier:
    """Two-class Gaussian Naive Bayes model fitted at construction time.

    The constructor stores the training data and computes the log priors,
    the per-class feature means and the per-class feature standard
    deviations using the module-level helper functions.
    """

    def __init__(self, train_features, train_labels):
        """Store the training set and fit priors, means and deviations."""
        self.train_features = train_features
        self.train_labels = train_labels
        self.log_py = log_prior(train_labels)
        # Routed through the accessor methods so a subclass can swap the
        # estimators without touching the constructor.
        self.mu_y = self.get_cc_means()
        self.sigma_y = self.get_cc_std()

    def get_cc_means(self):
        """Return the class-conditional feature means of the training set."""
        return cc_mean_ignore_missing(self.train_features, self.train_labels)

    def get_cc_std(self):
        """Return the class-conditional feature deviations of the training set."""
        return cc_std_ignore_missing(self.train_features, self.train_labels)

    def predict(self, features):
        """Return, per row of ``features``, the class index with the larger joint log-probability."""
        joint_log_prob = log_prob(features, self.mu_y, self.sigma_y, self.log_py)
        return np.argmax(joint_log_prob, axis=1)

In [None]:
# Fit the classifier on the training split, then label both splits with it
# (training first, evaluation second).
dia_classifier = NBClassifier(train_features=train_features, train_labels=train_labels)
train_pred, eval_pred = (
    dia_classifier.predict(split_features)
    for split_features in (train_features, eval_features)
)

In [None]:
# Accuracy is the fraction of predictions that equal the true label.
train_accuracy = np.mean(train_pred == train_labels)
eval_accuracy = np.mean(eval_pred == eval_labels)
print(
    f'The training data accuracy of your trained model is {train_accuracy}',
    f'The evaluation data accuracy of your trained model is {eval_accuracy}',
    sep='\n',
)