In [1]:
# Setup: third-party imports plus a fixed seed for NumPy's global random number generator.
import numpy as np
# NOTE(review): wildcard import; the visible cells only reference `ensemble` and `metrics` from it.
from sklearn import *
import mltools as ml
# NOTE(review): `plt` is not used in the visible cells.
import matplotlib.pyplot as plt
# Seed NumPy's global RNG. Presumably ml.shuffleData and the estimators created without
# an explicit random_state draw from it -- TODO confirm.
np.random.seed(0)

In [2]:
# Read the comma-delimited training features, training targets and test features
# (in that order) from the working directory.
X, Y, X_test = (
    np.genfromtxt(path, delimiter=',')
    for path in ('./X_train.txt', './Y_train.txt', './X_test.txt')
)

# Shuffle the training data, then split it into the training / validation pieces
# used by every sweep below.
# NOTE(review): 0.75 is presumably the fraction kept for training -- confirm against mltools.
X, Y = ml.shuffleData(X, Y)
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, 0.75)

In [3]:
# Sweep learning_rate, passing no other argument to the classifier.
# The validation error is not constant from run to run, but it is approximately
# stable enough to tell which learning rate is best.
learning_rates = [1, 0.5, 0.25, 0.1, 0.05, 0.01]
learning_TrErrors = []
learning_VaErrors = []
for rate in learning_rates:
    gradient_boost = ensemble.GradientBoostingClassifier(learning_rate=rate)
    gradient_boost.fit(Xtr, Ytr)

    # Errors are reported as 1 - score(...) on each split.
    train_err = 1 - gradient_boost.score(Xtr, Ytr)
    val_err = 1 - gradient_boost.score(Xva, Yva)
    learning_TrErrors.append(train_err)
    learning_VaErrors.append(val_err)

    # ROC AUC on the validation split, scored from column 1 of predict_proba.
    gradient_boost_roc = metrics.roc_auc_score(Yva, gradient_boost.predict_proba(Xva)[:, 1])

    print("learning_rates:", rate)
    print("training error:", train_err)
    print("validation error:", val_err)
    print("Roc:", gradient_boost_roc)
    print()

# Report the first learning rate that reaches the lowest validation error.
lowest_val_err = min(learning_VaErrors)
best_rate = learning_rates[learning_VaErrors.index(lowest_val_err)]
print("Minimum Validation Error:", lowest_val_err, "at learning_rate of:", best_rate)

learning_rates: 1
training error: 0.08855757140290998
validation error: 0.36476293103448276
Roc: 0.6793845617905768

learning_rates: 0.5
training error: 0.1140650260463445
validation error: 0.3545258620689655
Roc: 0.7098243678694807

learning_rates: 0.25
training error: 0.17082809412610023
validation error: 0.34967672413793105
Roc: 0.7294533631375737

learning_rates: 0.1
training error: 0.2394467397161847
validation error: 0.3313577586206896
Roc: 0.7437512700670594

learning_rates: 0.05
training error: 0.2881264594934435
validation error: 0.3351293103448276
Roc: 0.7470897320521381

learning_rates: 0.01
training error: 0.34524878749775467
validation error: 0.35290948275862066
Roc: 0.7409562516329433

Minimum Validation Error: 0.3313577586206896 at learning_rate of: 0.1


In [4]:
# Sweep n_estimators from 10 to 100 in steps of 10 with learning_rate fixed at 0.1.
n_estimators = range(10, 110, 10)
n_estimators_VaErrors = []
n_estimators_TrErrors = []
for n_trees in n_estimators:
    gradient_boost = ensemble.GradientBoostingClassifier(learning_rate=0.1, n_estimators=n_trees)
    gradient_boost.fit(Xtr, Ytr)

    # Errors are reported as 1 - score(...) on each split.
    train_err = 1 - gradient_boost.score(Xtr, Ytr)
    val_err = 1 - gradient_boost.score(Xva, Yva)
    n_estimators_TrErrors.append(train_err)
    n_estimators_VaErrors.append(val_err)

    # ROC AUC on the validation split, scored from column 1 of predict_proba.
    gradient_boost_roc = metrics.roc_auc_score(Yva, gradient_boost.predict_proba(Xva)[:, 1])

    print("n_estimators:", n_trees)
    print("training error:", train_err)
    print("validation error:", val_err)
    print("Roc:", gradient_boost_roc)
    print()

# Report the first n_estimators value that reaches the lowest validation error.
lowest_val_err = min(n_estimators_VaErrors)
best_n_trees = n_estimators[n_estimators_VaErrors.index(lowest_val_err)]
print("Minimum Validation Error:", lowest_val_err, "at n_estimator of:", best_n_trees)

n_estimators: 10
training error: 0.34722471708280944
validation error: 0.3469827586206896
Roc: 0.7404412575841147

n_estimators: 20
training error: 0.32530986168492904
validation error: 0.3448275862068966
Roc: 0.7442912300055158

n_estimators: 30
training error: 0.3120172444763787
validation error: 0.3469827586206896
Roc: 0.7463750108862892

n_estimators: 40
training error: 0.3005209268906054
validation error: 0.3405172413793104
Roc: 0.7474160304235491

n_estimators: 50
training error: 0.28740793964433264
validation error: 0.33782327586206895
Roc: 0.7459354951084275

n_estimators: 60
training error: 0.27375606251122686
validation error: 0.33459051724137934
Roc: 0.7468569106163092

n_estimators: 70
training error: 0.2663912340578408
validation error: 0.3286637931034483
Roc: 0.745648677678753

n_estimators: 80
training error: 0.25866714567989946
validation error: 0.3313577586206896
Roc: 0.7463581734258427

n_estimators: 90
training error: 0.2487874977546255
validation error: 0.3329741379

In [5]:
# Sweep max_depth from 1 to 10 with learning_rate=0.1 and n_estimators=70.
# The validation errors are not constant from run to run, but on average the
# differences between depth values are large enough that this does not matter.
depths = range(1, 11, 1)
depth_TrErrors = []
depth_VaErrors = []
for depth in depths:
    gradient_boost = ensemble.GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=70,
        max_depth=depth,
    )
    gradient_boost.fit(Xtr, Ytr)

    # Errors are reported as 1 - score(...) on each split.
    train_err = 1 - gradient_boost.score(Xtr, Ytr)
    val_err = 1 - gradient_boost.score(Xva, Yva)
    depth_TrErrors.append(train_err)
    depth_VaErrors.append(val_err)

    # ROC AUC on the validation split, scored from column 1 of predict_proba.
    gradient_boost_roc = metrics.roc_auc_score(Yva, gradient_boost.predict_proba(Xva)[:, 1])

    print("depth:", depth)
    print("training error:", train_err)
    print("validation error:", val_err)
    print("Roc:", gradient_boost_roc)
    print()

# Report the first depth that reaches the lowest validation error.
lowest_val_err = min(depth_VaErrors)
best_depth = depths[depth_VaErrors.index(lowest_val_err)]
print("Minimum Validation Error:", lowest_val_err, "at max_depth of:", best_depth)

depth: 1
training error: 0.34955990659241964
validation error: 0.35721982758620685
Roc: 0.7277347809678637

depth: 2
training error: 0.31866355308065386
validation error: 0.3475215517241379
Roc: 0.7467251139431591

depth: 3
training error: 0.2663912340578408
validation error: 0.3286637931034483
Roc: 0.745589456266148

depth: 4
training error: 0.20711334650619728
validation error: 0.3399784482758621
Roc: 0.7399134902894301

depth: 5
training error: 0.15735584695527216
validation error: 0.3475215517241379
Roc: 0.7295996748628328

depth: 6
training error: 0.08137237291180166
validation error: 0.34913793103448276
Roc: 0.7304636107643626

depth: 7
training error: 0.03951859170109573
validation error: 0.34375
Roc: 0.72944755711673

depth: 8
training error: 0.014190767019938932
validation error: 0.34159482758620685
Roc: 0.7281597816936163

depth: 9
training error: 0.003233339320998785
validation error: 0.34913793103448276
Roc: 0.7149208929660058

depth: 10
training error: 0.0
validation error

In [6]:
# Sweep max_leaf_nodes (2, 4, ..., 30) with learning_rate=0.1, n_estimators=70 and
# max_depth=3. Each setting is fitted n_repeats times and the per-fit errors are
# summed into node_TrErrors / node_VaErrors (indexed like `nodes`); the sums are
# turned into averages when the results are reported.
nodes = range(2, 32, 2)
n_repeats = 10  # the reporting step divides the sums by this same count
node_TrErrors = [0] * len(nodes)
node_VaErrors = [0] * len(nodes)
for _ in range(n_repeats):
    for n, max_leaves in enumerate(nodes):
        gradient_boost = ensemble.GradientBoostingClassifier(
            learning_rate=0.1,
            n_estimators=70,
            max_depth=3,
            max_leaf_nodes=max_leaves,
        )
        gradient_boost.fit(Xtr, Ytr)
        # Errors are accumulated as 1 - score(...) on each split.
        # The ROC AUC is intentionally not computed here: it was never read or
        # reported for this sweep, so it only added work to every fit.
        node_TrErrors[n] += 1 - gradient_boost.score(Xtr, Ytr)
        node_VaErrors[n] += 1 - gradient_boost.score(Xva, Yva)

In [7]:
# Report the average training / validation error for each max_leaf_nodes setting.
# The averages go into new lists rather than overwriting node_TrErrors /
# node_VaErrors, so re-running this cell does not divide the totals a second time.
# node_TrErrors / node_VaErrors therefore keep holding the accumulated sums.
n_runs = 10  # number of fits summed per setting by the accumulation loop
node_TrMeans = [total / n_runs for total in node_TrErrors]
node_VaMeans = [total / n_runs for total in node_VaErrors]

for n_leaves, train_mean, val_mean in zip(nodes, node_TrMeans, node_VaMeans):
    print("max_leaf_nodes: " + str(n_leaves))
    print("training error: " + str(train_mean))
    print("validation error: " + str(val_mean))
    print("")

# Report the first setting that reaches the lowest average validation error.
lowest_val_mean = min(node_VaMeans)
print("Minimum Validation Error: " + str(lowest_val_mean) +
      " at max_leaf_nodes of: " + str(nodes[node_VaMeans.index(lowest_val_mean)]))

max_leaf_nodes: 2
training error: 0.3495599065924196
validation error: 0.35721982758620685

max_leaf_nodes: 4
training error: 0.3084246452308244
validation error: 0.33890086206896547

max_leaf_nodes: 6
training error: 0.27698940183222565
validation error: 0.34428879310344834

max_leaf_nodes: 8
training error: 0.2504041674151249
validation error: 0.33243534482758624

max_leaf_nodes: 10
training error: 0.2326208011496318
validation error: 0.3360991379310345

max_leaf_nodes: 12
training error: 0.2155559547332495
validation error: 0.33911637931034483

max_leaf_nodes: 14
training error: 0.20657445661936413
validation error: 0.3367995689655172

max_leaf_nodes: 16
training error: 0.20716723549488053
validation error: 0.3413793103448276

max_leaf_nodes: 18
training error: 0.20786779234776365
validation error: 0.3423491379310345

max_leaf_nodes: 20
training error: 0.20670019759295855
validation error: 0.33992456896551726

max_leaf_nodes: 22
training error: 0.20671816058918627
validation error: 

Best parameters based on validation error (each chosen by the one-at-a-time sweeps above):<br>
&emsp;learning_rate = 0.1<br>
&emsp;n_estimators = 70<br>
&emsp;max_depth = 3<br>
&emsp;max_leaf_nodes = 8

In [8]:
# Refit on the training split with the parameter values listed as best above,
# then report training error, validation error and validation ROC AUC.
gradient_boost = ensemble.GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=70,
    max_depth=3,
    max_leaf_nodes=8,
)
gradient_boost.fit(Xtr, Ytr)
print(gradient_boost)

# ROC AUC on the validation split, scored from column 1 of predict_proba.
gradient_boost_roc = metrics.roc_auc_score(Yva, gradient_boost.predict_proba(Xva)[:, 1])

# Errors are reported as 1 - score(...) on each split.
train_err = 1 - gradient_boost.score(Xtr, Ytr)
val_err = 1 - gradient_boost.score(Xva, Yva)
print("Training error:", train_err)
print("Validation error:", val_err)
print("Roc:", gradient_boost_roc)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=8,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=70,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
Training error: 0.25040416741512483
Validation error: 0.3324353448275862
Roc: 0.7455122361889279
