Ex 8.1 - Spam classification using Logistic Regression
=====================================================

In [1]:
import numpy as np
import scipy.io as spio

def load_data():
    '''
    Loads the data from the matlab file and returns
    as numpy arrays with nice formatting
    '''
    A = spio.loadmat('../data/spam_data.mat')
    X_train = A['Xtrain']
    X_test = A['Xtest']
    y_train = A['ytrain'].flatten().astype(int)
    y_test = A['ytest'].flatten().astype(int)

    return X_train, X_test, y_train, y_test

def standarize_data(data):
    '''
    Returns data standarized so each column will have
    mean 0 and unit variance
    '''
    X_train, X_test, y_train, y_test = data
    # Train
    X_average = np.average(X_train, axis=0)
    X_std = np.std(X_train, axis=0)
    X_train = (X_train-X_average)/X_std
    # test
    X_average = np.average(X_test, axis=0)
    X_std = np.std(X_test, axis=0)
    X_test = (X_test-X_average)/X_std

    data = X_train, X_test, y_train, y_test
    return data


def transform_log_scale(data):
    '''
    Transform the data to log, making sure it's not NaN!
    '''
    X_train, X_test, y_train, y_test = data
    X_train = np.log(X_train + 0.001)
    X_test = np.log(X_test + 0.001)
    data = X_train, X_test, y_train, y_test
    return data


def binarize_data(data):
    '''
    Returns binarized data so that each entry will be
    just 0 or 1
    '''
    X_train, X_test, y_train, y_test = data
    X_train = (X_train > 0).astype(int)
    X_test = (X_test > 0).astype(int)
    data = X_train, X_test, y_train, y_test
    return data

In [2]:
X_train, X_test, y_train, y_test = load_data()
data_orig = (X_train, X_test, y_train, y_test)
data_standarized = standarize_data(data_orig)
data_log_scale = transform_log_scale(data_orig)
data_binarized = binarize_data(data_orig)

all_data = [data_orig, data_standarized,
            data_log_scale, data_binarized]
data_names = ['original', 'standarized',
              'log_scale', 'binarized']

In [111]:
def sigm(x):
    res = 1./(1 + np.exp(-x))
    return res


def jac(X, y, w, lamb):
    mu = sigm(X.dot(w))
    res = X.T.dot(mu - y) + 2*lamb*w
    return res

def gradient_descent(X, y, lamb):
    steps = 20000
    eta = 1./lamb
    eta_final = 0.01/lamb
    eta_step = (eta_final - eta)/steps
    n_samples, n_features = X.shape
    w = np.random.normal(size=n_features)
    for t in range(steps):
        grad = jac(X, y, w, lamb)
        w = w - eta * grad
        eta = eta + eta_step
    return w
    
def logistic_regression_l2(data, lamb):
    X_train, X_test, y_train, y_test = data
    n_test = len(y_test)
    X_train = np.hstack([np.ones((len(X_train),1)), X_train])
    X_test = np.hstack([np.ones((len(X_test),1)), X_test])
    w  = gradient_descent(X_train, y_train, lamb)
    prob_predicted = sigm(X_test.dot(w))
    y_predicted = ((2*prob_predicted - 1)>0).astype(int)
    error_rate = np.sum(y_predicted != y_test)/n_test
    return error_rate

In [112]:
lamb = 50.
for data, data_name in zip(all_data, data_names):
    error_rate = logistic_regression_l2(data, lamb)
    print('Logistic regression error rate for %s data: %.2f' %
          (data_name, error_rate))

  from ipykernel import kernelapp as app


Logistic regression error rate for original data: 0.59
Logistic regression error rate for standarized data: 0.09
Logistic regression error rate for log_scale data: 0.10
Logistic regression error rate for binarized data: 0.09
