In [1]:
import numpy as np
from sklearn import *
import mltools as ml
import matplotlib.pyplot as plt
# Seed NumPy's global RNG once, up front, so a fresh Restart & Run All repeats the same run.
# NOTE(review): presumably ml.shuffleData and the unseeded sklearn estimators below draw
# from this global state -- confirm; re-running a single cell will still shift results.
np.random.seed(0)

In [2]:
# Read the three comma-delimited text files in one pass. X_test is loaded here
# but is not used by the tuning cells that follow.
X, Y, X_test = (
    np.genfromtxt(path, delimiter=',')
    for path in ('./X_train.txt', './Y_train.txt', './X_test.txt')
)

# Shuffle rows, then split into training / validation parts.
# NOTE(review): 0.75 is presumably the training fraction -- confirm against mltools.splitData.
X, Y = ml.shuffleData(X, Y)
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, 0.75)

In [3]:
# Sweep learning_rate with every other hyperparameter left at its default.
# The validation error is not exactly constant from run to run (no random_state
# is passed to the estimator), but it is approximately stable enough to tell
# which setting is best.
learning_rates = [1, 0.5, 0.25, 0.1, 0.05, 0.01]
learning_TrErrors = []
learning_VaErrors = []
for rate in learning_rates:
    gradient_boost = ensemble.GradientBoostingClassifier(learning_rate=rate)
    gradient_boost.fit(Xtr, Ytr)
    print(f"learning_rates: {rate}")

    # Misclassification rate = 1 - mean accuracy on each split.
    train_err = 1 - gradient_boost.score(Xtr, Ytr)
    learning_TrErrors.append(train_err)
    val_err = 1 - gradient_boost.score(Xva, Yva)
    learning_VaErrors.append(val_err)
    print(f"training error: {train_err}")
    print(f"validation error: {val_err}")

    # AUC is computed from column 1 of predict_proba on the validation split.
    val_scores = gradient_boost.predict_proba(Xva)[:, 1]
    gradient_boost_roc = metrics.roc_auc_score(Yva, val_scores)
    print(f"Roc: {gradient_boost_roc}")
    print("")

# Report the first setting that attains the lowest validation error.
lowest_val_err = min(learning_VaErrors)
best_rate = learning_rates[learning_VaErrors.index(lowest_val_err)]
print(f"Minimum Validation Error: {lowest_val_err} at learning_rate of: {best_rate}")

learning_rates: 1
training error: 0.08855757140290998
validation error: 0.36476293103448276
Roc: 0.6793845617905768

learning_rates: 0.5
training error: 0.1140650260463445
validation error: 0.3545258620689655
Roc: 0.7098243678694807

learning_rates: 0.25
training error: 0.17082809412610023
validation error: 0.34967672413793105
Roc: 0.7294533631375737

learning_rates: 0.1
training error: 0.2394467397161847
validation error: 0.3313577586206896
Roc: 0.7437512700670594

learning_rates: 0.05
training error: 0.2881264594934435
validation error: 0.3351293103448276
Roc: 0.7470897320521381

learning_rates: 0.01
training error: 0.34524878749775467
validation error: 0.35290948275862066
Roc: 0.7409562516329433

Minimum Validation Error: 0.3313577586206896 at learning_rate of: 0.1


In [4]:
# Sweep the number of boosting stages (10, 20, ..., 100) with every other
# hyperparameter left at its default.
n_estimators = range(10, 110, 10)
n_estimators_VaErrors = []
n_estimators_TrErrors = []
for stage_count in n_estimators:
    gradient_boost = ensemble.GradientBoostingClassifier(n_estimators=stage_count)
    gradient_boost.fit(Xtr, Ytr)
    print(f"n_estimators: {stage_count}")

    # Misclassification rate = 1 - mean accuracy on each split.
    train_err = 1 - gradient_boost.score(Xtr, Ytr)
    n_estimators_TrErrors.append(train_err)
    val_err = 1 - gradient_boost.score(Xva, Yva)
    n_estimators_VaErrors.append(val_err)
    print(f"training error: {train_err}")
    print(f"validation error: {val_err}")

    # AUC is computed from column 1 of predict_proba on the validation split.
    val_scores = gradient_boost.predict_proba(Xva)[:, 1]
    gradient_boost_roc = metrics.roc_auc_score(Yva, val_scores)
    print(f"Roc: {gradient_boost_roc}")
    print("")

# Report the first setting that attains the lowest validation error.
lowest_val_err = min(n_estimators_VaErrors)
best_stage_count = n_estimators[n_estimators_VaErrors.index(lowest_val_err)]
print(f"Minimum Validation Error: {lowest_val_err} at n_estimator of: {best_stage_count}")

n_estimators: 10
training error: 0.34722471708280944
validation error: 0.3469827586206896
Roc: 0.7404412575841147

n_estimators: 20
training error: 0.32530986168492904
validation error: 0.3448275862068966
Roc: 0.7442912300055158

n_estimators: 30
training error: 0.3120172444763787
validation error: 0.3469827586206896
Roc: 0.7463750108862892

n_estimators: 40
training error: 0.3005209268906054
validation error: 0.3405172413793104
Roc: 0.7474160304235491

n_estimators: 50
training error: 0.28740793964433264
validation error: 0.33782327586206895
Roc: 0.7459354951084275

n_estimators: 60
training error: 0.27375606251122686
validation error: 0.33459051724137934
Roc: 0.7468569106163092

n_estimators: 70
training error: 0.2663912340578408
validation error: 0.3286637931034483
Roc: 0.745648677678753

n_estimators: 80
training error: 0.25866714567989946
validation error: 0.3313577586206896
Roc: 0.7463581734258427

n_estimators: 90
training error: 0.2487874977546255
validation error: 0.3329741379

In [5]:
# Sweep max_depth from 1 to 10 with every other hyperparameter left at its default.
# The validation errors are not exactly constant from run to run, but the gaps
# between depth values are, on average, large enough that this does not matter.
depths = range(1, 11, 1)
depth_TrErrors = []
depth_VaErrors = []
for depth in depths:
    gradient_boost = ensemble.GradientBoostingClassifier(max_depth=depth)
    gradient_boost.fit(Xtr, Ytr)
    print(f"depth: {depth}")

    # Misclassification rate = 1 - mean accuracy on each split.
    train_err = 1 - gradient_boost.score(Xtr, Ytr)
    depth_TrErrors.append(train_err)
    val_err = 1 - gradient_boost.score(Xva, Yva)
    depth_VaErrors.append(val_err)
    print(f"training error: {train_err}")
    print(f"validation error: {val_err}")

    # AUC is computed from column 1 of predict_proba on the validation split.
    val_scores = gradient_boost.predict_proba(Xva)[:, 1]
    gradient_boost_roc = metrics.roc_auc_score(Yva, val_scores)
    print(f"Roc: {gradient_boost_roc}")
    print("")

# Report the first setting that attains the lowest validation error.
lowest_val_err = min(depth_VaErrors)
best_depth = depths[depth_VaErrors.index(lowest_val_err)]
print(f"Minimum Validation Error: {lowest_val_err} at max_depth of: {best_depth}")

depth: 1
training error: 0.34488952757319924
validation error: 0.3582974137931034
Roc: 0.7318709902168548

depth: 2
training error: 0.3021375965511047
validation error: 0.3448275862068966
Roc: 0.7477312973553574

depth: 3
training error: 0.2394467397161847
validation error: 0.3313577586206896
Roc: 0.7436223764043314

depth: 4
training error: 0.1681336446919346
validation error: 0.3507543103448276
Roc: 0.7377791970273174

depth: 5
training error: 0.10957427698940181
validation error: 0.3512931034482759
Roc: 0.7253519900136443

depth: 6
training error: 0.049577869588647405
validation error: 0.3550646551724138
Roc: 0.7264110082155195

depth: 7
training error: 0.021375965511047257
validation error: 0.34967672413793105
Roc: 0.725142973263274

depth: 8
training error: 0.0030537093587210684
validation error: 0.34967672413793105
Roc: 0.7253496676053067

depth: 9
training error: 0.0003592599245554329
validation error: 0.34536637931034486
Roc: 0.7161680262432143

depth: 10
training error: 0.0
va

In [6]:
# Sweep max_leaf_nodes (2, 4, ..., 30), refitting each setting several times and
# summing the errors so that run-to-run variation can be averaged out afterwards.
# node_TrErrors / node_VaErrors hold SUMS when this cell finishes; the averaging
# cell that follows divides them by 10, so keep n_repeats in step with that divisor.
nodes = range(2, 32, 2)
n_repeats = 10
node_TrErrors = [0.0] * len(nodes)
node_VaErrors = [0.0] * len(nodes)
for _ in range(n_repeats):
    for n, leaf_cap in enumerate(nodes):
        gradient_boost = ensemble.GradientBoostingClassifier(max_leaf_nodes=leaf_cap)
        gradient_boost.fit(Xtr, Ytr)
        # Misclassification rate = 1 - mean accuracy; accumulate per setting.
        # (The per-fit ROC-AUC that used to be computed here was never read,
        # so that extra predict_proba + roc_auc_score work has been dropped.)
        node_TrErrors[n] += 1 - gradient_boost.score(Xtr, Ytr)
        node_VaErrors[n] += 1 - gradient_boost.score(Xva, Yva)

In [7]:
# Turn the accumulated error sums into per-setting averages and report them.
# NOTE: the division happens in place, so this cell is NOT idempotent -- running it
# a second time without re-running the sweep cell divides by the repeat count again.
print(node_VaErrors)  # raw sums, before averaging
repeat_count = 10  # must equal the number of repeats used in the sweep cell
for idx, leaf_cap in enumerate(nodes):
    node_TrErrors[idx] /= repeat_count
    node_VaErrors[idx] /= repeat_count
    print(f"max_leaf_nodes: {leaf_cap}")
    print(f"training error: {node_TrErrors[idx]}")
    print(f"validation error: {node_VaErrors[idx]}")
    print("")

# Report the first setting that attains the lowest averaged validation error.
lowest_val_err = min(node_VaErrors)
best_leaf_cap = nodes[node_VaErrors.index(lowest_val_err)]
print(f"Minimum Validation Error: {lowest_val_err} at max_leaf_nodes of: {best_leaf_cap}")

[3.582974137931034, 3.426724137931034, 3.389008620689655, 3.355064655172414, 3.4003232758620694, 3.4709051724137927, 3.4692887931034475, 3.4261853448275863, 3.432112068965517, 3.446659482758621, 3.4218750000000004, 3.422952586206896, 3.4224137931034484, 3.44989224137931, 3.452047413793104]
max_leaf_nodes: 2
training error: 0.3448895275731992
validation error: 0.35829741379310337

max_leaf_nodes: 4
training error: 0.29046164900305366
validation error: 0.34267241379310337

max_leaf_nodes: 6
training error: 0.25004490749056935
validation error: 0.33890086206896547

max_leaf_nodes: 8
training error: 0.22543560265852353
validation error: 0.3355064655172414

max_leaf_nodes: 10
training error: 0.20603556673253104
validation error: 0.34003232758620694

max_leaf_nodes: 12
training error: 0.18394108137237283
validation error: 0.3470905172413793

max_leaf_nodes: 14
training error: 0.18034848212681878
validation error: 0.3469288793103448

max_leaf_nodes: 16
training error: 0.17275013472247172
vali

Best parameters based on validation error (each parameter was swept on its own, with the others left at their defaults):<br>
&emsp;learning_rate = 0.1<br>
&emsp;n_estimators = 70<br>
&emsp;max_depth = 3<br>
&emsp;max_leaf_nodes = 8

In [8]:
# Refit on the training split using the value each single-parameter sweep selected.
# NOTE(review): how max_depth and max_leaf_nodes interact depends on the installed
# scikit-learn version (older releases document max_depth as ignored when
# max_leaf_nodes is set) -- confirm which limit is actually in effect.
best_params = dict(learning_rate=0.1, n_estimators=70, max_depth=3, max_leaf_nodes=8)
gradient_boost = ensemble.GradientBoostingClassifier(**best_params)
gradient_boost.fit(Xtr, Ytr)
print(gradient_boost)

# AUC is computed from column 1 of predict_proba on the validation split.
val_scores = gradient_boost.predict_proba(Xva)[:, 1]
gradient_boost_roc = metrics.roc_auc_score(Yva, val_scores)

# Misclassification rate = 1 - mean accuracy on each split.
final_train_err = 1 - gradient_boost.score(Xtr, Ytr)
final_val_err = 1 - gradient_boost.score(Xva, Yva)
print(f"Training error: {final_train_err}")
print(f"Validation error: {final_val_err}")
print(f"Roc: {gradient_boost_roc}")

Training error: 0.25040416741512483
Validation error: 0.3324353448275862
Roc: 0.7455122361889279
