In [23]:
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [2]:
data = np.loadtxt(open("train_2008.csv", "rb"), delimiter=",", skiprows=1)

In [3]:
X = data[:, 1:-1]
y = data[:, -1]

In [67]:
N = len(y) #381
D = len(X[0]) #64667
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [51]:
X_sample, y_sample = X_train[:1000], y_train[:1000]

In [68]:
# Clean up data (get rid of columns with 0 std)
X_std = np.std(X, axis=0)
X = X[:, np.where(X_std!=0)]

# Normalize
X_mean = np.mean(X, axis=0)
X_std = np.std(X, axis=0)

In [69]:
X_train = (X_train - X_mean) / X_std
X_test = (X_test - X_mean) / X_std

ValueError: operands could not be broadcast together with shapes (45266,1,366) (381,) 

In [11]:
class GradientBoosting():
    def __init__(self, n_clfs=100):
        '''
        Initialize the gradient boosting model.

        Inputs:
            n_clfs (default 100): Initializer for self.n_clfs.        
                
        Attributes:
            self.n_clfs: The number of DT weak regressors.
            self.clfs: A list of the DT weak regressors, initialized as empty.
        '''
        self.n_clfs = n_clfs
        self.clfs = []
        
    def fit(self, X, Y, n_nodes=4):
        '''
        Fit the gradient boosting model by training self.n_clfs DT weak regressors and store them in self.clfs.

        Inputs:
            X: A (N, D) shaped numpy array containing the data points.
            Y: A (N, ) shaped numpy array containing the (float) labels of the data points.
               (Even though the labels are ints, we treat them as floats.)
            n_nodes: The max number of nodes that the DT weak regressors are allowed to have.
        '''
        # store predictions from previous weak regressors to train on residuals
        predictions = np.zeros(shape=(len(Y), ))
        
        for i in range(self.n_clfs):
            clf = DecisionTreeRegressor(max_leaf_nodes=n_nodes)
            clf.fit(X, Y - predictions)
            self.clfs.append(clf)
            predictions += clf.predict(X)
        
    def predict(self, X):
        '''
        Predict on the given dataset.

        Inputs:
            X: A (N, D) shaped numpy array containing the data points.

        Outputs:
            A (N, ) shaped numpy array containing the (float) labels of the data points.
            (Even though the labels are ints, we treat them as floats.)
        '''
        # Initialize predictions.
        Y_pred = np.zeros(len(X))
        
        # Add predictions from each DT weak regressor.
        for clf in self.clfs:
            Y_curr = clf.predict(X)
            Y_pred += Y_curr

        # Return the sign of the predictions.
        return Y_pred

    def loss(self, X, Y):
        '''
        Calculate the classification loss.

        Inputs:
            X: A (N, D) shaped numpy array containing the data points.
            Y: A (N, ) shaped numpy array containing the (float) labels of the data points.
               (Even though the labels are ints, we treat them as floats.)
            
        Outputs:
            The classification loss.
        '''
        # Calculate the points where the predictions and the ground truths don't match.
        Y_pred = self.predict(X)
        misclassified = np.where(Y_pred != Y)[0]

        # Return the fraction of such points.
        return float(len(misclassified)) / len(X)

In [44]:
model = GradientBoosting()
model.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [32]:
y_score = model.predict(X_test)

In [33]:
roc_auc_score(y_test, y_score)

0.7687220329092246