In [1]:
import csv
import sys

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

The following functions implement AdaBoost end to end: fitting the ensemble to the data, making predictions, and writing the predictions to a text file.

In [2]:
def adaboost(X, y, num_iter):
    """Fit an AdaBoost ensemble of depth-1 decision trees.

    Input: X         feature matrix, one row per sample
           y         labels, one per row of X; assumes y is in {-1, 1}^n
           num_iter  number of boosting rounds
    Outputs: trees          list of fitted DecisionTreeClassifier objects
             trees_weights  list of floats, one vote weight per tree
    """
    trees = []
    trees_weights = []

    # Work on a positional array: y[i] on a pandas Series is a label lookup
    # and goes wrong as soon as the index is not 0..n-1.
    labels = np.asarray(y)
    n_samples = len(labels)
    weights = np.full(n_samples, 1.0 / n_samples)

    for _ in range(num_iter):
        d_tree = DecisionTreeClassifier(max_depth=1)
        d_tree.fit(X, labels, sample_weight=weights)
        trees.append(d_tree)

        missed = np.asarray(d_tree.predict(X)) != labels

        # Weighted training error of this round's tree.
        error = weights[missed].sum() / weights.sum()

        # Explicit zero check: NumPy floats do not raise ZeroDivisionError
        # (they produce inf and a warning), so a try/except cannot be relied
        # on to catch a perfect classifier.
        if error > 0:
            alpha = np.log((1 - error) / error)
        else:
            alpha = 1.0
        trees_weights.append(alpha)

        # Up-weight the misclassified samples; leave the rest untouched.
        weights = np.where(missed, weights * np.exp(alpha), weights)

    return trees, trees_weights


def adaboost_predict(X, trees, trees_weights):
    """Given X, trees and weights predict Y.

    Each tree votes with its prediction scaled by the matching entry of
    trees_weights; the result is the sign of the summed votes.

    Input: X              samples to classify, one row per sample
           trees          fitted classifiers exposing predict(X)
           trees_weights  one vote weight per tree
    Output: array of length len(X) with entries from np.sign, i.e. -1.0 or
            1.0, and 0.0 where the votes cancel exactly (or when trees is
            empty).
    """
    # One accumulator slot per sample, so the result is always a length-n
    # array, including when there are no trees.
    votes = np.zeros(len(X))

    for tree, alpha in zip(trees, trees_weights):
        votes += alpha * np.asarray(tree.predict(X))

    return np.sign(votes)


def parse_spambase_data(filename):
    """Read a header-less CSV file and split it into features and labels.

    Returns a pair (X, Y) of pandas objects:
    X is a DataFrame with every column except the last, one row per line;
    Y is a Series holding the last column of each row.
    """
    table = pd.read_csv(filename, header=None)

    # The label sits in the final column; everything before it is a feature.
    label_pos = table.shape[1] - 1
    return table.iloc[:, :label_pos], table.iloc[:, label_pos]


def new_label(Y):
    """Transform a vector of 0s and 1s into -1s and 1s.

    Entries equal to 0 become -1.0; every other entry becomes 1.0.
    Returns a new list.
    """
    relabeled = []
    for value in Y:
        if value == 0.:
            relabeled.append(-1.)
        else:
            relabeled.append(1.)
    return relabeled


def old_label(Y):
    """Transform a vector of -1s and 1s back into 0s and 1s.

    Entries equal to -1 become 0.0; every other entry becomes 1.0.
    Returns a new list.
    """
    restored = []
    for value in Y:
        if value == -1.:
            restored.append(0.)
        else:
            restored.append(1.)
    return restored


def accuracy(y, pred):
    """Return the fraction of positions where pred equals y.

    Both arguments are converted to NumPy arrays first so the comparison is
    element-wise for any sequence type; comparing two plain lists with ==
    would yield a single bool instead of one result per element.
    """
    matches = np.asarray(y) == np.asarray(pred)
    return np.mean(matches)


def write_predictions(filename, x_test, y_test, prediction):
    """Write features, true labels and predicted labels to a CSV file.

    One row is written per entry of prediction, with no header: the feature
    values of that sample, then its true label, then its predicted label.
    Both label columns are passed through old_label before writing.

    Input: filename    path of the file to create or overwrite
           x_test      feature matrix, anything pd.DataFrame accepts
           y_test      true labels, one per row of x_test
           prediction  predicted labels, one per row of x_test
    """
    features = pd.DataFrame(x_test)
    truth = old_label(y_test)
    guesses = old_label(prediction)

    # csv.writer needs a binary file on Python 2 but a text file opened with
    # newline="" on Python 3, where "wb" makes writerow raise TypeError.
    if sys.version_info[0] >= 3:
        csvfile = open(filename, "w", newline="")
    else:
        csvfile = open(filename, "wb")

    with csvfile:
        writer = csv.writer(csvfile)
        for i in range(len(prediction)):
            row = list(features.iloc[i, :])
            row.append(truth[i])
            row.append(guesses[i])
            writer.writerow(row)

Test the implementation on the Spambase training and test files.

In [3]:
# Training split: read the file, then recode its labels with new_label.
X, Y = parse_spambase_data('spambase.train')
Y = new_label(Y)

# Test split: same two steps.
X_test, Y_test = parse_spambase_data('spambase.test')
Y_test = new_label(Y_test)

In [4]:
num_iter = [10, 20, 50, 100, 150, 200, 300, 400, 500, 600, 800, 1000]
train_error = []
test_error = []

# Each boosting round depends only on the rounds before it, so the first
# `num` trees of the longest run form a `num`-round ensemble. Fit once and
# evaluate prefixes instead of refitting from scratch for every setting.
# NOTE(review): DecisionTreeClassifier is created without random_state, so
# individual fits may differ in tie-breaking between runs -- confirm whether
# exact reproduction of earlier printed numbers matters.
trees, alphas = adaboost(X, Y, max(num_iter))

for num in num_iter:
    Yhat_test = adaboost_predict(X_test, trees[:num], alphas[:num])
    Yhat = adaboost_predict(X, trees[:num], alphas[:num])

    acc_test = accuracy(Y_test, Yhat_test)
    acc = accuracy(Y, Yhat)

    # Print the test accuracy for this ensemble size; the call form of print
    # works on both Python 2 and Python 3.
    print(acc_test)
    train_error.append(1 - acc)
    test_error.append(1 - acc_test)

0.919
0.932
0.944
0.947
0.95
0.951
0.949
0.95
0.953
0.954
0.954
0.954
