In [1]:
# Setup.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from IPython.display import HTML

#Using tools provided to us for Problem Set 3.
from boosting_helper import (
    generate_dataset,
    visualize_dataset,
    gb_suite, ab_suite,
    visualize_loss_curves_gb, visualize_loss_curves_ab,
    animate_gb, animate_ab
)

In [5]:
# Read both text tables in one pass; the first row of each file is skipped.
training_data, test_data = (
    np.loadtxt(path, skiprows=1)
    for path in ('training_data.txt', 'test_data.txt')
)

# Column 0 of the training table is taken as the target, the remaining
# columns as the features.
y_train = training_data[:, 0]
X_train = training_data[:, 1:]

# Every column of the test table is used as a feature.
X_test = test_data

# Using the AdaBoost class written for Problem Set 3.

In [3]:
class AdaBoost():
    '''
    AdaBoost ensemble of small decision-tree weak classifiers.

    The weight update in fit() multiplies each label by the weak classifier's
    prediction and predict() returns the sign of the weighted vote, so labels
    are expected to be encoded as -1 / +1.
    '''

    def __init__(self, n_clfs=100):
        '''
        Initialize the AdaBoost model.
        Inputs:
            n_clfs (default 100): Initializer for self.n_clfs.
        Attributes:
            self.n_clfs: The number of DT weak classifiers.
            self.coefs: A list of the AdaBoost coefficients.
            self.clfs: A list of the DT weak classifiers, initialized as empty.
        '''
        self.n_clfs = n_clfs
        self.coefs = []
        self.clfs = []

    def fit(self, X, Y, n_nodes=4):
        '''
        Fit self.n_clfs weak classifiers with AdaBoost re-weighting.

        Any classifiers and coefficients left over from a previous call are
        discarded first, so the ensemble always holds exactly self.n_clfs
        members after fitting.

        Inputs:
            X: A (N, D) shaped numpy array containing the data points.
            Y: A (N, ) shaped array containing the -1 / +1 labels.
            n_nodes: The max number of nodes that the DT weak classifiers are allowed to have.
        Outputs:
            A (N, T) shaped numpy array, where T is the number of iterations / DT weak classifiers,
            such that the t^th column contains D_{t+1} (the dataset weights at iteration t+1).
        '''
        # Smallest / largest weighted error used when computing alpha. Without
        # this bound a weak classifier that is perfect (error 0) or always
        # wrong (error 1) yields an infinite alpha and NaN weights.
        eps = 1e-10

        Y = np.asarray(Y)
        N = len(Y)

        # Start every fit from an empty ensemble.
        self.coefs = []
        self.clfs = []

        # D_1: uniform weights over the data points.
        Ds = np.ones(N) / float(N)
        weights = np.zeros((N, self.n_clfs))

        for t in range(self.n_clfs):
            clf = DecisionTreeClassifier(max_leaf_nodes=n_nodes)
            clf.fit(X, Y, sample_weight=Ds)
            self.clfs.append(clf)

            # Predict once and reuse for both the error and the update.
            Y_hat = clf.predict(X)

            # Weighted training error of this weak classifier.
            error = np.sum(Ds[Y_hat != Y])
            error = np.clip(error, eps, 1.0 - eps)
            alpha = 0.5 * np.log((1.0 - error) / error)

            # Up-weight misclassified points, down-weight the rest, renormalize.
            Ds = Ds * np.exp(-alpha * Y * Y_hat)
            Ds /= np.sum(Ds)

            self.coefs.append(alpha)
            weights[:, t] = Ds

        return weights

    def predict(self, X):
        '''
        Predict on the given dataset.

        Inputs:
            X: A (N, D) shaped numpy array containing the data points.

        Outputs:
            A (N, ) shaped numpy array containing the (float) labels of the data points.
            (Even though the labels are ints, we treat them as floats.)
            An entry is 0 when the weighted vote for that point is exactly tied.
        '''
        # Initialize predictions.
        Y_pred = np.zeros(len(X))

        # Add the coefficient-weighted prediction of each DT weak classifier.
        for coef, clf in zip(self.coefs, self.clfs):
            Y_pred += coef * clf.predict(X)

        # Return the sign of the predictions.
        return np.sign(Y_pred)

    def loss(self, X, Y):
        '''
        Compute the classification error on the given dataset.

        Inputs:
            X: A (N, D) shaped numpy array containing the data points.
            Y: A (N, ) shaped numpy array containing the labels.

        Outputs:
            The fraction (a float) of points whose prediction differs from Y.
        '''
        # Calculate the points where the predictions and the ground truths don't match.
        Y_pred = self.predict(X)
        misclassified = np.where(Y_pred != Y)[0]

        # Return the fraction of such points.
        return float(len(misclassified)) / len(X)

In [None]:
# Train an AdaBoost ensemble with the default number of weak classifiers.
clf = AdaBoost()

# fit() returns the per-iteration dataset weights; keep them in a variable
# rather than echoing the full matrix as the cell output.
train_weights = clf.fit(X_train, y_train)
train_weights.shape

In [None]:
# test_data is used entirely as features (X_test), so there are no test labels
# to score against. Hold out part of the labelled training set for evaluation.
# NOTE(review): assumes ab_suite's last two arguments are the evaluation
# features / labels, as the original argument names suggest — confirm against
# boosting_helper.
split_rng = np.random.default_rng(0)  # fixed seed so the split is reproducible
shuffled_idx = split_rng.permutation(len(X_train))
n_val = len(X_train) // 5
val_idx, fit_idx = shuffled_idx[:n_val], shuffled_idx[n_val:]

model, D = ab_suite(
    AdaBoost, 500,
    X_train[fit_idx], y_train[fit_idx],
    X_train[val_idx], y_train[val_idx],
)