In [1]:
%matplotlib

import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss

Using matplotlib backend: Qt5Agg




### Example from Peter Prettenhofer - Gradient Boosted Regression Trees in scikit-learn

In [2]:
# Toy dataset produced by make_hastie_10_2; fit a 200-tree, depth-3 boosted classifier on it.
X, y = make_hastie_10_2(n_samples=10000)
est = GradientBoostingClassifier(max_depth=3, n_estimators=200).fit(X, y)
# Bare expression so the fitted estimator's repr is shown as the cell output.
est

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=200, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [3]:
# In-sample check: predictions are made on the same X the classifier was fit on.
pred = est.predict(X)
cf0 = confusion_matrix(y_true=y, y_pred=pred)
cf0

array([[4981,   71],
       [ 145, 4803]])

### GradientBoostingRegressor

In [4]:
# Fit 2000 depth-1 trees on a 100-sample toy set and overlay the prediction
# after every boosting stage (one faint red line per stage).
# GradientBoostingRegressor is imported with the other imports at the top of the notebook.
X, y = make_hastie_10_2(n_samples = 100)
est = GradientBoostingRegressor(n_estimators = 2000, max_depth = 1).fit(X, y)

# plt.plot joins points in the order given; sort by the plotted feature so each
# stage is drawn left to right instead of zig-zagging across the axes.
order = np.argsort(X[:, 0])
x0_sorted = X[order, 0]
for pred in est.staged_predict(X):
    plt.plot(x0_sorted, pred[order], color = 'r', alpha = 0.1)

plt.xlabel('X[:, 0]')
plt.ylabel('Staged prediction');

## Boosting on the OTTO dataset

### Reading files using pandas

In [6]:
# Load the full training table; the shape is reported before any column is removed.
train = pd.read_csv("Data_Files/train.csv")
print("Training set has {0[0]} rows and {0[1]} columns".format(train.shape))

# Keep the class labels separately, then strip the label and row-id columns
# so that `train` holds feature columns only.
labels = train['target']
train = train.drop(['target', 'id'], axis=1)

print(train.head())

Training set has 61878 rows and 95 columns
   feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  feat_9  \
0       1       0       0       0       0       0       0       0       0   
1       0       0       0       0       0       0       0       1       0   
2       0       0       0       0       0       0       0       1       0   
3       1       0       0       1       6       1       5       0       0   
4       0       0       0       0       0       0       0       0       0   

   feat_10   ...     feat_84  feat_85  feat_86  feat_87  feat_88  feat_89  \
0        0   ...           0        1        0        0        0        0   
1        0   ...           0        0        0        0        0        0   
2        0   ...           0        0        0        0        0        0   
3        1   ...          22        0        1        2        0        0   
4        0   ...           0        1        0        0        0        0   

   feat_90  feat_91  feat_92  f

In [10]:
# Stratified shuffle split of the full data, so class proportions are preserved.
sss = StratifiedShuffleSplit(labels, n_iter=3, test_size=0.051, train_size=0.02)
# Three splits are configured but only the first one is taken.
train_index, test_index = next(iter(sss))

train_x, test_x = train.values[train_index], train.values[test_index]
train_y, test_y = labels.values[train_index], labels.values[test_index]

In [14]:
# Fraction of all rows whose label is 'Class_6'.
(labels == 'Class_6').mean()

0.22843336888716506

In [17]:
# Share of 'Class_6' in the full label set, then in the train and test subsets;
# the three numbers should be close if the split kept the class proportions.
for subset in (labels, train_y, test_y):
    print(np.mean(subset == 'Class_6'))

0.228433368887
0.22877930477
0.22845373891


In [19]:
# Column layout of the exported files: every feature column followed by 'target'.
cols = list(train.columns) + ['target']

# Attach the labels as an extra right-hand column and write each subset to CSV.
train_data = np.hstack((train_x, train_y[:, np.newaxis]))
df = pd.DataFrame(train_data, columns=cols)
df.to_csv("train_small.csv", index=False)

test_data = np.hstack((test_x, test_y[:, np.newaxis]))
df = pd.DataFrame(test_data, columns=cols)
df.to_csv("test_small.csv", index=False)

In [20]:
# Reload the small training subset; its shape is printed while 'target' is still a column.
strain_x = pd.read_csv("train_small.csv")
print("Small training set has {0[0]} rows and {0[1]} columns".format(strain_x.shape))
# pop() returns the label column and removes it from the feature frame in one step.
strain_y = strain_x.pop('target')

# Same procedure for the small test subset.
stest_x = pd.read_csv("test_small.csv")
print("Small test set has {0[0]} rows and {0[1]} columns".format(stest_x.shape))
stest_y = stest_x.pop('target')

Small training set has 1237 rows and 94 columns
Small test set has 3156 rows and 94 columns


### Test of GradientBoosting with small training and test datasets

In [21]:
# Baseline boosted classifier trained on the small training subset.
est = GradientBoostingClassifier(max_features=1.0, n_estimators=200).fit(strain_x, strain_y)
# Bare expression so the fitted estimator's repr is shown as the cell output.
est

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=1.0, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=200, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [22]:
# Training-set diagnostics for the small-subset classifier:
# hard predictions feed the confusion matrix, class probabilities feed the log loss.
y = est.predict(strain_x)
proba = est.predict_proba(strain_x)

print(confusion_matrix(strain_y, y))
print(log_loss(strain_y, proba))

[[ 38   0   0   0   0   0   0   0   0]
 [  0 322   0   0   0   0   0   0   0]
 [  0   4 156   0   0   0   0   0   0]
 [  0   0   0  54   0   0   0   0   0]
 [  0   0   0   0  55   0   0   0   0]
 [  0   0   0   0   0 283   0   0   0]
 [  0   0   0   0   0   0  57   0   0]
 [  0   0   0   0   0   0   0 169   0]
 [  0   0   0   0   0   0   0   0  99]]
0.0596637282383


### GridSearch for tuning hyperparameters

In [26]:
# Hyperparameter search space: 4 * 3 * 4 * 4 = 192 candidate settings.
# GridSearchCV and StratifiedKFold are imported with the other imports at the
# top of the notebook.
param_grid = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'max_depth': [2, 4, 6],
              'min_samples_leaf': [3, 5, 9, 17],
              'max_features': [1.0, 0.7, 0.3, 0.1]}
est = GradientBoostingClassifier(n_estimators = 200)

# Stratified folds keep the per-class proportions in every split. The splitter is
# passed to the search explicitly instead of being built and left unused.
# With the default fold count this should match the previous `cv = 3`, since an
# integer cv on a classifier is documented to use stratified folds as well.
skf = StratifiedKFold(strain_y)

# 'neg_log_loss' is the negated log loss, so scores closer to 0 are better.
gs_cv = GridSearchCV(est, param_grid, cv = skf, scoring = 'neg_log_loss')

In [None]:
# Run the search on the small training subset; every grid candidate is fit once per CV fold.
gs_cv.fit(X=strain_x, y=strain_y)

In [312]:
# Best hyperparameters, then the search's score on the small train and test subsets.
# The search was configured with scoring='neg_log_loss', so values are negated
# log losses (closer to 0 is better).
# print() is called as a function, as in the rest of the notebook, so the cell
# also runs on Python 3.
print(gs_cv.best_params_)
print(gs_cv.score(strain_x, strain_y))
print(gs_cv.score(stest_x, stest_y))
gs_cv.grid_scores_

{'max_features': 0.7, 'learning_rate': 0.05, 'max_depth': 4, 'min_samples_leaf': 17}
-0.131446627936
-0.831515917566


[mean: -0.81436, std: 0.02329, params: {'max_features': 1.0, 'learning_rate': 0.1, 'max_depth': 2, 'min_samples_leaf': 3},
 mean: -0.81777, std: 0.05899, params: {'max_features': 1.0, 'learning_rate': 0.1, 'max_depth': 2, 'min_samples_leaf': 5},
 mean: -0.82465, std: 0.08921, params: {'max_features': 1.0, 'learning_rate': 0.1, 'max_depth': 2, 'min_samples_leaf': 9},
 mean: -0.78213, std: 0.05095, params: {'max_features': 1.0, 'learning_rate': 0.1, 'max_depth': 2, 'min_samples_leaf': 17},
 mean: -0.87916, std: 0.01566, params: {'max_features': 0.7, 'learning_rate': 0.1, 'max_depth': 2, 'min_samples_leaf': 3},
 mean: -0.89048, std: 0.06364, params: {'max_features': 0.7, 'learning_rate': 0.1, 'max_depth': 2, 'min_samples_leaf': 5},
 mean: -0.86750, std: 0.11402, params: {'max_features': 0.7, 'learning_rate': 0.1, 'max_depth': 2, 'min_samples_leaf': 9},
 mean: -0.86859, std: 0.09186, params: {'max_features': 0.7, 'learning_rate': 0.1, 'max_depth': 2, 'min_samples_leaf': 17},
 mean: -0.9012

In [314]:
# Dump the per-candidate grid-search scores to CSV.
df = pd.DataFrame(data=gs_cv.grid_scores_)
df.to_csv("GBGridSearch.csv")

In [287]:
# Stratified split of the full dataset with 10% of the rows held out for testing.
sss = StratifiedShuffleSplit(labels, n_iter=3, test_size=0.1)
# Three splits are configured but only the first one is taken.
train_index, test_index = next(iter(sss))

train_x, test_x = train.values[train_index], train.values[test_index]
train_y, test_y = labels.values[train_index], labels.values[test_index]

In [288]:
# Two 1000-tree models that differ only in max_features (0.7 vs. None).
shared_params = dict(n_estimators=1000, learning_rate=0.05,
                     max_depth=4, min_samples_leaf=17)

est_1 = GradientBoostingClassifier(max_features=0.7, **shared_params)
est_1.fit(train_x, train_y)

est_2 = GradientBoostingClassifier(max_features=None, **shared_params)
est_2.fit(train_x, train_y)

KeyboardInterrupt: 

In [298]:
# Log loss of est_1 after each boosting stage, on the held-out rows and on the training rows.
test_score_1 = np.array([log_loss(test_y, stage_proba)
                         for stage_proba in est_1.staged_predict_proba(test_x)])
train_score_1 = np.array([log_loss(train_y, stage_proba)
                          for stage_proba in est_1.staged_predict_proba(train_x)])

In [305]:
# Learning curves for est_1: log loss against the number of boosting stages.
n_trees_1 = len(est_1.estimators_)
stages_1 = np.arange(n_trees_1)

plt.figure(1)
for scores, curve_label in ((test_score_1, 'Test'), (train_score_1, 'Train')):
    plt.plot(stages_1, scores, label=curve_label)
plt.legend()
plt.xlim(0, n_trees_1)
plt.xlabel('Number of trees')
plt.ylabel('LogLoss')
plt.title('Performance')
plt.grid(True)

In [299]:
# Log loss of est_2 after each boosting stage, on the held-out rows and on the training rows.
test_score_2 = np.array([log_loss(test_y, stage_proba)
                         for stage_proba in est_2.staged_predict_proba(test_x)])
train_score_2 = np.array([log_loss(train_y, stage_proba)
                          for stage_proba in est_2.staged_predict_proba(train_x)])

In [304]:
# Learning curves for est_2: log loss against the number of boosting stages.
n_trees_2 = len(est_2.estimators_)
stages_2 = np.arange(n_trees_2)

plt.figure(2)
for scores, curve_label in ((test_score_2, 'Test'), (train_score_2, 'Train')):
    plt.plot(stages_2, scores, label=curve_label)
plt.legend()
plt.xlim(0, n_trees_2)
plt.xlabel('Number of trees')
plt.ylabel('LogLoss')
plt.title('Performance')
plt.grid(True)

In [296]:
# Training-set log loss of each fitted model, est_1 first and est_2 second.
for model in (est_1, est_2):
    print(log_loss(train_y, model.predict_proba(train_x)))

0.247029867332
0.243521994492
