In [1]:
# Setup.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from IPython.display import HTML

#Using tools provided to us for Problem Set 3.
from boosting_helper import (
    generate_dataset,
    visualize_dataset,
    gb_suite, ab_suite,
    visualize_loss_curves_gb, visualize_loss_curves_ab,
    animate_gb, animate_ab
)

In [3]:
# Read both data files, skipping the first line of each (presumably a header row).
training_data = np.loadtxt('training_data.txt', skiprows=1)
test_data = np.loadtxt('test_data.txt', skiprows=1)

# Column 0 of the training matrix holds the labels; the remaining columns are the features.
y_train, X_train = training_data[:, 0], training_data[:, 1:]

# The test matrix is used as-is as the feature matrix (no column is split off).
X_test = test_data

In [10]:
# Boosted ensemble of 200 estimators using scikit-learn's default base learner.
# random_state is pinned so that a fresh "Restart & Run All" rebuilds the same model.
clf = AdaBoostClassifier(n_estimators=200, random_state=0)
clf.fit(X_train, y_train)

# Mean accuracy on the training set. Kept as the last expression of the cell so the
# notebook actually displays it (previously the value was computed and discarded).
# Predictions on X_test are computed in the following cell, where they are saved.
clf.score(X_train, y_train)

In [11]:
predictions = clf.predict(X_test)

# Row ids are 1-based positions within the test set.
index = np.arange(len(predictions)) + 1

# Two columns per row: (Id, Prediction).
printarray = np.column_stack([index, predictions])
print(printarray[0])

# comments='' is needed here: by default np.savetxt prefixes header lines with '# ',
# which would write the first line as '# Id,Prediction' instead of 'Id,Prediction'.
np.savetxt('Ada1_predictions.txt', printarray, fmt='%1i', delimiter=',',
           header='Id,Prediction', comments='')

[ 1.  1.]


# Search for the optimal parameters.

In [3]:
def classification_err(y, real_y):
    """
    This function returns the classification error between two equally-sized vectors of
    labels; this is the fraction of samples for which the labels differ.

    Labels are compared element-wise with `!=` instead of by subtraction, so any
    array-like input is accepted (lists, tuples, NumPy arrays) and the labels do not
    have to be numeric (strings and booleans work as well).

    Inputs:
        y: (N, ) shaped array-like of predicted labels
        real_y: (N, ) shaped array-like of true labels
    Output:
        Scalar classification error in [0, 1], returned as a Python float
    Raises:
        ValueError: if the two inputs do not have the same shape, or contain no samples
    """
    y = np.asarray(y)
    real_y = np.asarray(real_y)

    # Refuse mismatched inputs explicitly: NumPy broadcasting would otherwise
    # compare them silently and produce a meaningless error rate.
    if y.shape != real_y.shape:
        raise ValueError(
            "y and real_y must have the same shape, got %s and %s"
            % (y.shape, real_y.shape)
        )

    # The error rate is undefined when there are no samples to compare.
    if y.size == 0:
        raise ValueError("cannot compute the classification error of empty label vectors")

    # The mean of the boolean mismatch mask is exactly the fraction of differing labels.
    return float(np.mean(y != real_y))

# Hard-coded number of estimators (trees) for random forest classifiers.
# NOTE(review): nothing in the visible part of this notebook reads this constant;
# presumably a later cell passes it to a random forest -- confirm before changing it.
n_estimators = 1000

def eval_tree_based_model_min_samples(clf, min_samples_leaf, X_train, y_train, X_test, y_test):
    """
    Sweep the `min_samples_leaf` stopping parameter of a tree-based classifier and
    record the classification error obtained at every setting.

    For each candidate value the classifier is reconfigured through `set_params`,
    refit on the training data, and scored on both the training and the test data.
    The classifier object passed in is modified in place: when the function returns
    it is left fitted with the last value of `min_samples_leaf`.

    Inputs:
        clf: classifier object exposing set_params / fit / predict and accepting a
             `min_samples_leaf` parameter (e.g. a decision tree or a random forest)
        min_samples_leaf: a (T, ) vector of the min_samples_leaf values to test,
                          where T is the number of values
        X_train: (N, D) matrix of training samples
        y_train: (N, ) vector of training labels
        X_test: (M, D) matrix of test samples
        y_test: (M, ) vector of test labels
    Output:
        train_err: (T, ) vector of classification errors on the training data
        test_err: (T, ) vector of classification errors on the test data
    """
    n_settings = len(min_samples_leaf)
    train_err, test_err = np.zeros(n_settings), np.zeros(n_settings)

    def error_on(samples, labels):
        # Classification error of the currently fitted model on one data split.
        return classification_err(clf.predict(samples), labels)

    for k in range(n_settings):
        # Reconfigure and refit the same estimator object for this candidate value.
        clf.set_params(min_samples_leaf=min_samples_leaf[k])
        clf.fit(X_train, y_train)

        train_err[k] = error_on(X_train, y_train)
        test_err[k] = error_on(X_test, y_test)

    return train_err, test_err
