Ex 8.2 - Spam classification using Naive Bayes
==============================================

In [1]:
import numpy as np
import scipy.io as spio

In [7]:
def load_data():
    '''
    Loads the data from the matlab file and returns
    as numpy arrays with nice formatting
    '''
    A = spio.loadmat('../data/spam_data.mat')
    X_train = A['Xtrain']
    X_test = A['Xtest']
    y_train = A['ytrain'].flatten().astype(int)
    y_test = A['ytest'].flatten().astype(int)

    return X_train, X_test, y_train, y_test

def standarize_data(data):
    '''
    Returns data standarized so each column will have
    mean 0 and unit variance
    '''
    X_train, X_test, y_train, y_test = data
    # Train
    X_average = np.average(X_train, axis=0)
    X_std = np.std(X_train, axis=0)
    X_train = (X_train-X_average)/X_std
    # test
    X_average = np.average(X_test, axis=0)
    X_std = np.std(X_test, axis=0)
    X_test = (X_test-X_average)/X_std

    data = X_train, X_test, y_train, y_test
    return data


def transform_log_scale(data):
    '''
    Transform the data to log, making sure it's not NaN!
    '''
    X_train, X_test, y_train, y_test = data
    X_train = np.log(X_train + 0.001)
    X_test = np.log(X_test + 0.001)
    data = X_train, X_test, y_train, y_test
    return data


def binarize_data(data):
    '''
    Returns binarized data so that each entry will be
    just 0 or 1
    '''
    X_train, X_test, y_train, y_test = data
    X_train = (X_train > 0).astype(int)
    X_test = (X_test > 0).astype(int)
    data = X_train, X_test, y_train, y_test
    return data

def naive_bayes(data):
    '''
    Does naive bayes classification on data,
    returning the error rate
    '''
    X_train, X_test, y_train, y_test = data
    mask_spam_train = (y_train == 1)
    n_spam = np.sum(mask_spam_train)
    pi_spam = n_spam/float(len(y_train))
    alpha = 1  # pseudocount
    theta_spam = (alpha + np.sum(X_train[mask_spam_train], axis=0))
    theta_not_spam = (alpha + np.sum(X_train[~mask_spam_train], axis=0))
    theta_spam = theta_spam/float(n_spam + 2*alpha)
    theta_not_spam = theta_not_spam / float(n_spam + 2*alpha)
    log_prob_spam = (np.log(pi_spam) +
                     np.sum(np.log(theta_spam * X_test +
                                   (1-theta_spam) * (1-X_test)), axis=1))
    log_prob_not_spam = (np.log(1 - pi_spam) +
                         np.sum(np.log(theta_not_spam * X_test +
                                       (1-theta_not_spam) * (1-X_test)),
                                axis=1))
    log_odds_spam = log_prob_spam - log_prob_not_spam
    y_predicted = (log_odds_spam > 0).astype(int)
    error_rate = np.sum(y_predicted != y_test)/len(y_test)
    return error_rate


def gaussian_naive_bayes(data):
    '''
    Does gaussian naive bayes classification on data,
    returning the error rate
    '''
    X_train, X_test, y_train, y_test = data
    mask_spam_train = (y_train == 1)
    n_spam = np.sum(mask_spam_train)
    pi_spam = n_spam/len(y_train)
    n_test = len(X_test)
    mean_spam = np.average(X_train[mask_spam_train], axis=0)
    sigma_spam = np.std(X_train[mask_spam_train], axis=0) + 1e-7
    mean_not_spam = np.average(X_train[~mask_spam_train], axis=0)
    sigma_not_spam = np.std(X_train[~mask_spam_train], axis=0) + 1e-7
    norm_spam = 1./2 * np.sum(np.log(n_test * 2 * np.pi * sigma_spam**2))
    log_prob_spam = (-np.sum((X_test - mean_spam)**2 /
                     (2 * sigma_spam**2), axis=1) +
                     np.log(pi_spam) - norm_spam)
    norm_n_spam = 1./2 * np.sum(np.log(n_test * 2 * np.pi * sigma_not_spam**2))
    log_prob_not_spam = (-np.sum((X_test - mean_not_spam)**2 /
                         (2 * sigma_not_spam**2), axis=1) +
                         np.log(1 - pi_spam) - norm_n_spam)
    log_odds_spam = log_prob_spam - log_prob_not_spam
    y_predicted = (log_odds_spam > 0).astype(int)
    error_rate = np.sum(y_predicted != y_test)/n_test
    return error_rate

In [8]:
X_train, X_test, y_train, y_test = load_data()
data_orig = (X_train, X_test, y_train, y_test)
data_standarized = standarize_data(data_orig)
data_log_scale = transform_log_scale(data_orig)
data_binarized = binarize_data(data_orig)

all_data = [data_orig, data_standarized,
            data_log_scale, data_binarized]
data_names = ['original', 'standarized',
              'log_scale', 'binarized']

In [9]:
error_rate = naive_bayes(data_binarized)
print('Naive Bayes error rate for binarized data: %.2f' %
      error_rate)


# Gaussian naive bayes for continuous data

for data, data_name in zip(all_data, data_names):
    if data_name != 'binarized':
        error_rate = gaussian_naive_bayes(data)
        print('Naive Bayes (gaussian) error rate for %s data: %.2f' %
              (data_name, error_rate))

Naive Bayes error rate for binarized data: 0.09
Naive Bayes (gaussian) error rate for original data: 0.19
Naive Bayes (gaussian) error rate for standarized data: 0.19
Naive Bayes (gaussian) error rate for log_scale data: 0.15
