In [11]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation,metrics
from sklearn.grid_search import GridSearchCV
import matplotlib.pylab as plt
%matplotlib inline

In [12]:
# Load the cleaned training features and the outcome labels from their CSV files,
# then display the feature table's dimensions.
train, target = (
    pd.read_csv(csv_name) for csv_name in ('clean_train.csv', 'outcome_train.csv')
)
train.shape

(26729, 11)

In [13]:
from sklearn.preprocessing import LabelEncoder,label_binarize

# Encode the outcome labels as integer class ids (0..n_classes-1).
# `target.values` is a 2-D single-column array; LabelEncoder expects a 1-D array and
# emits a DataConversionWarning (via column_or_1d) when handed a column vector,
# so flatten it explicitly before encoding.
outcome_le = LabelEncoder()
outcome = outcome_le.fit_transform(target.values.ravel())

  y = column_or_1d(y, warn=True)


Step 1: Fix learning rate and number of estimators for tuning tree-based parameters

To decide on the boosting parameters, we first need to set initial values for the other parameters. Let's take the following values:

max_depth = 5 : This should be between 3-10. I’ve started with 5 but you can choose a different number as well. 4-6 can be good starting points.

min_child_weight = 1 : A smaller value is chosen because it is a highly imbalanced class problem and leaf nodes can have smaller size groups.

gamma = 0 : A small value such as 0.1-0.2 can also be chosen as a starting point. This will be tuned later anyway.

subsample, colsample_bytree = 0.8 : This is a commonly used starting value. Typical values range between 0.5 and 0.9.

scale_pos_weight = 1: Because of high class imbalance.

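Let's take the default learning rate of 0.1 here and check the optimum number of trees using the cv function of xgboost. (Note: the next cell fits a fixed n_estimators=1000 directly and does not call the cv function.)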

In [14]:
# Baseline model: the starting hyper-parameters described above, fitted on the full
# training table with multi-class log loss as the evaluation metric.
baseline_params = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    scale_pos_weight=1,
    seed=27,
)
xgb1 = XGBClassifier(**baseline_params).fit(
    train.values, outcome, eval_metric='mlogloss'
)

In [15]:
# Baseline model outputs on the training rows: predicted labels and predict_proba scores.
train_features = train.values
train_y_prediction = xgb1.predict(train_features)
train_y_perdprob = xgb1.predict_proba(train_features)

In [16]:
# Accuracy of the baseline model on the same rows it was trained on
# (an in-sample figure, not a held-out estimate).
metrics.accuracy_score(y_true=outcome, y_pred=train_y_prediction)

0.85584945190616935

In [17]:
# Stage 1: coarse 5-fold grid over tree depth and minimum child weight,
# scored with log loss; all other booster settings stay at their starting values.
param_test1 = {
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
}
stage1_estimator = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    scale_pos_weight=1,
    seed=27,
)
gsearch1 = GridSearchCV(
    estimator=stage1_estimator,
    param_grid=param_test1,
    scoring='log_loss',
    iid=False,
    cv=5,
)

In [18]:
# Run the stage-1 search, then display every grid score together with the
# winning parameter combination and its score.
gsearch1.fit(train.values, outcome)
(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)

([mean: -0.79702, std: 0.00753, params: {'min_child_weight': 1, 'max_depth': 3},
  mean: -0.79687, std: 0.00776, params: {'min_child_weight': 3, 'max_depth': 3},
  mean: -0.79709, std: 0.00786, params: {'min_child_weight': 5, 'max_depth': 3},
  mean: -0.77280, std: 0.00730, params: {'min_child_weight': 1, 'max_depth': 5},
  mean: -0.77317, std: 0.00725, params: {'min_child_weight': 3, 'max_depth': 5},
  mean: -0.77320, std: 0.00751, params: {'min_child_weight': 5, 'max_depth': 5},
  mean: -0.76207, std: 0.00814, params: {'min_child_weight': 1, 'max_depth': 7},
  mean: -0.76182, std: 0.00832, params: {'min_child_weight': 3, 'max_depth': 7},
  mean: -0.76272, std: 0.00780, params: {'min_child_weight': 5, 'max_depth': 7},
  mean: -0.76511, std: 0.00905, params: {'min_child_weight': 1, 'max_depth': 9},
  mean: -0.76307, std: 0.00976, params: {'min_child_weight': 3, 'max_depth': 9},
  mean: -0.76308, std: 0.01079, params: {'min_child_weight': 5, 'max_depth': 9}],
 {'max_depth': 7, 'min_chil

In [20]:
# Stage 2: finer 5-fold grid over depth and minimum child weight around the
# stage-1 region, scored with log loss.
param_test2 = {
    'max_depth': [7, 8, 9],
    'min_child_weight': [2, 3, 4],
}
stage2_estimator = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=7,
    min_child_weight=3,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
gsearch2 = GridSearchCV(
    estimator=stage2_estimator,
    param_grid=param_test2,
    scoring='log_loss',
    iid=False,
    cv=5,
)
gsearch2.fit(train.values, outcome)
(gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_)

([mean: -0.76159, std: 0.00762, params: {'min_child_weight': 2, 'max_depth': 7},
  mean: -0.76182, std: 0.00832, params: {'min_child_weight': 3, 'max_depth': 7},
  mean: -0.76252, std: 0.00739, params: {'min_child_weight': 4, 'max_depth': 7},
  mean: -0.76150, std: 0.00952, params: {'min_child_weight': 2, 'max_depth': 8},
  mean: -0.75991, std: 0.00929, params: {'min_child_weight': 3, 'max_depth': 8},
  mean: -0.76058, std: 0.00977, params: {'min_child_weight': 4, 'max_depth': 8},
  mean: -0.76475, std: 0.00934, params: {'min_child_weight': 2, 'max_depth': 9},
  mean: -0.76307, std: 0.00976, params: {'min_child_weight': 3, 'max_depth': 9},
  mean: -0.76290, std: 0.01037, params: {'min_child_weight': 4, 'max_depth': 9}],
 {'max_depth': 8, 'min_child_weight': 3},
 -0.75991461619198897)

best_params so far:
max_depth: 8
min_child_weight: 3
log_loss = -0.7599

In [29]:
# Stage 3: 5-fold grid over gamma (0.0 to 0.4 in steps of 0.1) with depth 8 and
# minimum child weight 3 held fixed, scored with log loss.
param_test3 = {
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
}
stage3_estimator = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=8,
    min_child_weight=3,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
gsearch3 = GridSearchCV(
    estimator=stage3_estimator,
    param_grid=param_test3,
    scoring='log_loss',
    iid=False,
    cv=5,
)
gsearch3.fit(train.values, outcome)
(gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_)

([mean: -0.75991, std: 0.00929, params: {'gamma': 0.0},
  mean: -0.76128, std: 0.00911, params: {'gamma': 0.1},
  mean: -0.76132, std: 0.00946, params: {'gamma': 0.2},
  mean: -0.76171, std: 0.01019, params: {'gamma': 0.3},
  mean: -0.76141, std: 0.01034, params: {'gamma': 0.4}],
 {'gamma': 0.0},
 -0.75991461619198897)

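gamma tested: 0~1.4, best: 0.9 (these notes do not match the grid output above, which covers only 0.0-0.4 and selects gamma = 0.0 with a score of -0.7599; presumably they come from an earlier run)
best score: 0.7594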

In [30]:
# Stage 4: 5-fold grid over row and column subsampling ratios
# (0.75 to 0.95 in steps of 0.05), scored with log loss.
sampling_ratios = [0.75, 0.8, 0.85, 0.9, 0.95]
param_test4 = {
    'subsample': list(sampling_ratios),
    'colsample_bytree': list(sampling_ratios),
}
stage4_estimator = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=8,
    min_child_weight=3,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
gsearch4 = GridSearchCV(
    estimator=stage4_estimator,
    param_grid=param_test4,
    scoring='log_loss',
    iid=False,
    cv=5,
)
gsearch4.fit(train.values, outcome)
(gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_)

([mean: -0.76267, std: 0.00985, params: {'subsample': 0.75, 'colsample_bytree': 0.75},
  mean: -0.75991, std: 0.00929, params: {'subsample': 0.8, 'colsample_bytree': 0.75},
  mean: -0.75950, std: 0.00841, params: {'subsample': 0.85, 'colsample_bytree': 0.75},
  mean: -0.76036, std: 0.00964, params: {'subsample': 0.9, 'colsample_bytree': 0.75},
  mean: -0.76006, std: 0.00899, params: {'subsample': 0.95, 'colsample_bytree': 0.75},
  mean: -0.76267, std: 0.00985, params: {'subsample': 0.75, 'colsample_bytree': 0.8},
  mean: -0.75991, std: 0.00929, params: {'subsample': 0.8, 'colsample_bytree': 0.8},
  mean: -0.75950, std: 0.00841, params: {'subsample': 0.85, 'colsample_bytree': 0.8},
  mean: -0.76036, std: 0.00964, params: {'subsample': 0.9, 'colsample_bytree': 0.8},
  mean: -0.76006, std: 0.00899, params: {'subsample': 0.95, 'colsample_bytree': 0.8},
  mean: -0.76383, std: 0.01022, params: {'subsample': 0.75, 'colsample_bytree': 0.85},
  mean: -0.76320, std: 0.00929, params: {'subsample'

best colsample_bytree:0.75
best subsample: 0.8
-0.75940338547688613

In [31]:
# Stage 5: 5-fold grid over the L1 (reg_alpha) and L2 (reg_lambda) regularisation
# strengths on a wide logarithmic range, scored with log loss.
param_test5 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
    'reg_lambda': [1e-5, 1e-2, 0.1, 1, 100],
}
stage5_estimator = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=8,
    min_child_weight=3,
    gamma=0,
    subsample=0.85,
    colsample_bytree=0.75,
    objective='multi:softprob',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
gsearch5 = GridSearchCV(
    estimator=stage5_estimator,
    param_grid=param_test5,
    scoring='log_loss',
    iid=False,
    cv=5,
)
gsearch5.fit(train.values, outcome)
(gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_)

([mean: -0.76175, std: 0.00956, params: {'reg_lambda': 1e-05, 'reg_alpha': 1e-05},
  mean: -0.76152, std: 0.00868, params: {'reg_lambda': 0.01, 'reg_alpha': 1e-05},
  mean: -0.76044, std: 0.00960, params: {'reg_lambda': 0.1, 'reg_alpha': 1e-05},
  mean: -0.75950, std: 0.00841, params: {'reg_lambda': 1, 'reg_alpha': 1e-05},
  mean: -0.77731, std: 0.00739, params: {'reg_lambda': 100, 'reg_alpha': 1e-05},
  mean: -0.76182, std: 0.00967, params: {'reg_lambda': 1e-05, 'reg_alpha': 0.01},
  mean: -0.76171, std: 0.00889, params: {'reg_lambda': 0.01, 'reg_alpha': 0.01},
  mean: -0.76108, std: 0.00988, params: {'reg_lambda': 0.1, 'reg_alpha': 0.01},
  mean: -0.76012, std: 0.00950, params: {'reg_lambda': 1, 'reg_alpha': 0.01},
  mean: -0.77732, std: 0.00732, params: {'reg_lambda': 100, 'reg_alpha': 0.01},
  mean: -0.76067, std: 0.00930, params: {'reg_lambda': 1e-05, 'reg_alpha': 0.1},
  mean: -0.76206, std: 0.00912, params: {'reg_lambda': 0.01, 'reg_alpha': 0.1},
  mean: -0.76086, std: 0.00953, 

In [144]:
# Finer regularisation grid, run with 4 parallel grid-search jobs.
# NOTE(review): this cell rebinds `param_test5` / `gsearch5` from the previous
# regularisation search, and its base estimator uses min_child_weight=1, gamma=0.9 and
# subsample=0.8, which differ from the values used by the other searches in this
# notebook (3, 0 and 0.85) - confirm which configuration is intended.
# NOTE(review): n_jobs=4 combined with nthread=4 may oversubscribe the CPU - confirm.
param_test5 = {
    'reg_alpha': [1e-10, 1e-8, 1e-5, 1e-3],
    'reg_lambda': [0.5, 1, 25, 50],
}
fine_reg_estimator = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=8,
    min_child_weight=1,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.75,
    objective='multi:softprob',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
gsearch5 = GridSearchCV(
    estimator=fine_reg_estimator,
    param_grid=param_test5,
    scoring='log_loss',
    n_jobs=4,
    iid=False,
    cv=5,
)
gsearch5.fit(train.values, outcome)
(gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_)

descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensio

([mean: -0.76045, std: 0.00730, params: {'reg_alpha': 1e-10, 'reg_lambda': 0.5},
  mean: -0.75940, std: 0.00696, params: {'reg_alpha': 1e-10, 'reg_lambda': 1},
  mean: -0.77397, std: 0.00769, params: {'reg_alpha': 1e-10, 'reg_lambda': 25},
  mean: -0.78058, std: 0.00809, params: {'reg_alpha': 1e-10, 'reg_lambda': 50},
  mean: -0.76045, std: 0.00730, params: {'reg_alpha': 1e-08, 'reg_lambda': 0.5},
  mean: -0.75940, std: 0.00696, params: {'reg_alpha': 1e-08, 'reg_lambda': 1},
  mean: -0.77397, std: 0.00769, params: {'reg_alpha': 1e-08, 'reg_lambda': 25},
  mean: -0.78058, std: 0.00809, params: {'reg_alpha': 1e-08, 'reg_lambda': 50},
  mean: -0.76070, std: 0.00732, params: {'reg_alpha': 1e-05, 'reg_lambda': 0.5},
  mean: -0.75940, std: 0.00696, params: {'reg_alpha': 1e-05, 'reg_lambda': 1},
  mean: -0.77405, std: 0.00781, params: {'reg_alpha': 1e-05, 'reg_lambda': 25},
  mean: -0.78058, std: 0.00809, params: {'reg_alpha': 1e-05, 'reg_lambda': 50},
  mean: -0.76090, std: 0.00712, params: 

'reg_alpha': 1e-05, 'reg_lambda': 1},
 -0.75940337024263005

In [32]:
# Final model: the tuned tree and regularisation settings, refitted on the full
# training table with a 10x smaller learning rate and a 5000-tree budget.
tuned_params = dict(
    learning_rate=0.01,
    n_estimators=5000,
    max_depth=8,
    min_child_weight=3,
    gamma=0,
    subsample=0.85,
    colsample_bytree=0.75,
    reg_alpha=1e-05,
    reg_lambda=1,
    objective='multi:softprob',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
xgb_tuned = XGBClassifier(**tuned_params).fit(
    train.values, outcome, eval_metric='mlogloss'
)

In [33]:
# Tuned model outputs on the training rows; these overwrite the baseline model's
# `train_y_prediction` / `train_y_perdprob` values.
train_features = train.values
train_y_prediction = xgb_tuned.predict(train_features)
train_y_perdprob = xgb_tuned.predict_proba(train_features)

In [34]:
# Accuracy of the tuned model on the same rows it was trained on
# (an in-sample figure, not a held-out estimate).
metrics.accuracy_score(y_true=outcome, y_pred=train_y_prediction)

0.96707695761158297

In [155]:
# Preview only the first rows; a bare `train` would display the whole training frame.
train.head()

array([ 0.53642768,  0.00335125,  0.23850651,  0.05440159,  0.16731298], dtype=float32)

([mean: 0.84479, std: 0.02617, params: {'min_child_weight': 1, 'max_depth': 3},
  mean: 0.84502, std: 0.02447, params: {'min_child_weight': 3, 'max_depth': 3},
  mean: 0.84678, std: 0.02293, params: {'min_child_weight': 5, 'max_depth': 3},
  mean: 0.84284, std: 0.02511, params: {'min_child_weight': 1, 'max_depth': 5},
  mean: 0.84399, std: 0.02894, params: {'min_child_weight': 3, 'max_depth': 5},
  mean: 0.84755, std: 0.02700, params: {'min_child_weight': 5, 'max_depth': 5},
  mean: 0.83168, std: 0.02229, params: {'min_child_weight': 1, 'max_depth': 7},
  mean: 0.83924, std: 0.02286, params: {'min_child_weight': 3, 'max_depth': 7},
  mean: 0.84213, std: 0.02765, params: {'min_child_weight': 5, 'max_depth': 7},
  mean: 0.82738, std: 0.02106, params: {'min_child_weight': 1, 'max_depth': 9},
  mean: 0.83525, std: 0.01870, params: {'min_child_weight': 3, 'max_depth': 9},
  mean: 0.84548, std: 0.02843, params: {'min_child_weight': 5, 'max_depth': 9}],
 {'max_depth': 5, 'min_child_weight': 5

([mean: 0.83887, std: 0.00801, params: {'min_child_weight': 1, 'max_depth': 3},
  mean: 0.83946, std: 0.00887, params: {'min_child_weight': 3, 'max_depth': 3},
  mean: 0.83904, std: 0.00746, params: {'min_child_weight': 5, 'max_depth': 3},
  mean: 0.84921, std: 0.00670, params: {'min_child_weight': 1, 'max_depth': 5},
  mean: 0.84857, std: 0.00750, params: {'min_child_weight': 3, 'max_depth': 5},
  mean: 0.84948, std: 0.00621, params: {'min_child_weight': 5, 'max_depth': 5},
  mean: 0.84923, std: 0.00802, params: {'min_child_weight': 1, 'max_depth': 7},
  mean: 0.85145, std: 0.00546, params: {'min_child_weight': 3, 'max_depth': 7},
  mean: 0.85128, std: 0.00743, params: {'min_child_weight': 5, 'max_depth': 7},
  mean: 0.84574, std: 0.00627, params: {'min_child_weight': 1, 'max_depth': 9},
  mean: 0.84935, std: 0.00600, params: {'min_child_weight': 3, 'max_depth': 9},
  mean: 0.84844, std: 0.00626, params: {'min_child_weight': 5, 'max_depth': 9}],
 {'max_depth': 7, 'min_child_weight': 3

In [38]:
# Load the test feature table and score it with the tuned model (predict_proba output).
test = pd.read_csv('clean_test_bc.csv')
test_features = test.values
test_prediction = xgb_tuned.predict_proba(test_features)

In [40]:
# Build the submission table: an ID column followed by one probability column per
# outcome class, then write it to CSV.

# Take the column labels from the fitted encoder rather than a hand-typed list, so
# label i always names the column for encoded class i.
class_names = list(outcome_le.classes_)

# `columns=` labels the raw probability array; if this cell is re-run on the frame it
# already produced, it simply re-selects the class columns (the previous version
# failed with a length mismatch on a second run).
test_prediction = pd.DataFrame(test_prediction, columns=class_names)

ID = pd.read_csv('test.csv')
# Fail loudly instead of silently writing NaN IDs when the two files disagree.
if len(ID) != len(test_prediction):
    raise ValueError(
        'test.csv has %d rows but there are %d predictions'
        % (len(ID), len(test_prediction))
    )
test_prediction.insert(0, 'ID', ID['ID'].values)

test_prediction.to_csv('xgb_prediction_tuned_bc.csv', index=False)

In [41]:
# Preview the first rows of the submission table.
test_prediction.head()

Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,0.00887,0.000364,0.058387,0.144876,0.787503
1,2,0.734765,2.3e-05,0.005155,0.208538,0.051518
2,3,0.521224,0.000132,0.004207,0.338722,0.135715
3,4,0.79839,7.3e-05,0.007692,0.133525,0.060319
4,5,0.677184,3.3e-05,0.00014,0.305116,0.017526


In [45]:
# NOTE(review): `total` is not assigned in any cell visible in this notebook, so this
# cell would raise NameError on a fresh Restart & Run All unless it is defined
# elsewhere - confirm where it comes from or remove the cell.
total.head()

Unnamed: 0,AnimalType,AgeCode,ColorCode,BreedCode,IntactOrNot,NameCode,Hour,Weekday,Day of Month,Month,Year,OutcomeType
0,1,1,0,0,1.0,1,18,2,12,2,2014,Return_to_owner
1,0,1,0,0,1.0,1,12,6,13,10,2013,Euthanasia
2,1,1,0,0,1.0,1,12,5,31,1,2015,Adoption
3,0,0,0,0,0.0,0,19,4,11,7,2014,Transfer
4,1,1,1,0,1.0,0,12,4,15,11,2013,Transfer


In [35]:
# Preview the first rows of the training feature table.
train.head()

Unnamed: 0,AnimalType,AgeCode,IntactOrNot,NameCode,Hour,Weekday,Day of Month,Month,Year,Color,Breed
0,1,1,1.0,1,18,2,12,2,2014,15,46
1,0,1,1.0,1,12,6,13,10,2013,26,27
2,1,1,1.0,1,12,5,31,1,2015,7,40
3,0,0,0.0,0,19,4,11,7,2014,8,27
4,1,1,1.0,0,12,4,15,11,2013,49,33


In [89]:
# NOTE(review): the recorded output for this cell is two-dimensional (26729, 5), whereas
# LabelEncoder.fit_transform yields a 1-D array; presumably this was executed while
# `outcome` held a binarized version of the labels - confirm and re-run.
outcome.shape

(26729, 5)