In [1]:
import xgboost as xgb
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing, pipeline, metrics, grid_search, cross_validation
import time
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

###### Week 3 we will be focused on XGBoost tuning and will be using engineered features from week 2 which by the end of this notebook will lead you to a score between 0.736-0.739. But feel free to use whichever features that  work for you.

In [2]:
# load train and test data and output full data and column dictionary
def load_data():
    # Load data
    start = time.time() 
    train = pd.read_csv('./data/train.csv', parse_dates=['DateTime'])
    print ("Loading train data finished in %0.3fs" % (time.time() - start))        

    test = pd.read_csv('./data/test.csv', parse_dates=['DateTime'])
    print ("Loading test data finished in %0.3fs" % (time.time() - start))  

    # seperate column names by categorical, numerical, label, and id
    data_types = train.dtypes  
    categorical_cols = list(data_types[data_types=='object'].index) + list(data_types[data_types=='datetime64[ns]'].index)
    numerical_cols = list(data_types[data_types=='int64'].index) + list(data_types[data_types=='float64'].index)

    dict_cols = dict()
    # categorical columns
    dict_cols['categorical_cols'] = categorical_cols
    dict_cols['categorical_cols'].remove('AnimalID') # remove ids
    dict_cols['categorical_cols'].remove('OutcomeType') # remove labels
    dict_cols['categorical_cols'].remove('OutcomeSubtype')

    # numeric columns
    dict_cols['numerical_cols'] = numerical_cols

    # id columns
    dict_cols['id_col'] = ['AnimalID']

    # label columns
    dict_cols['label_col'] = ['OutcomeType']
    
    # Merge train and test
    dict_cols['train_size'] = train.shape[0]
    dict_cols['test_size'] = test.shape[0]
    print 'train data size: %s' % dict_cols['train_size']
    print 'test data size: %s' % dict_cols['test_size']

    full_data = pd.concat([train[dict_cols['categorical_cols'] + dict_cols['numerical_cols'] + dict_cols['label_col']]
                         , test[dict_cols['categorical_cols'] + dict_cols['numerical_cols']]
                        ])
    
    return dict_cols, full_data

In [3]:
## function to grid search the best model
def search_model(train_x, train_y, est, param_grid, n_jobs, cv, refit=False):
    model = grid_search.GridSearchCV(estimator  = est, 
                                     param_grid = param_grid, 
                                     scoring = 'log_loss', 
                                     verbose = 10, 
                                     n_jobs  = n_jobs, 
                                     iid = True, # is identically distributed 
                                     refit = refit,
                                     cv = cv)
    # Fit Grid Search Model
    model.fit(train_x, train_y)
    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set:", model.best_params_)
    print("Scores:", model.grid_scores_)
    return model

In [4]:
# function to plot feature importance
def plot_feature_importance(feature_importances, feature_names):
    ftr_imp_df = pd.DataFrame(sorted(zip(feature_names, feature_importances)
                          , key=lambda x: x[1], reverse = False)
                   )
    y_pos = np.arange(ftr_imp_df.shape[0])

    plt.barh(y_pos, ftr_imp_df[1], align='center', alpha=0.4)
    plt.yticks(y_pos, ftr_imp_df[0])
    plt.xlabel('Feature Importance')

    plt.show()

In [5]:
# function to label encoding, and output label mapping
def label_encoding(cols, full_data):
    label_mapping = {}
    for col in cols:
        print("Label encoding column: %s" % (col))
        LBL = preprocessing.LabelEncoder()
        LBL.fit(full_data[col])
        if col == 'OutcomeType':
            label_mapping = dict(zip(full_data['OutcomeType'].unique(), LBL.transform(full_data['OutcomeType'].unique())))
#             print("Label mapping: %s" % (label_mapping))
        full_data[col] = LBL.transform(full_data[col])
        
    return label_mapping

In [6]:
# function to transform ages into days
def age2days(age):
    age = str(age)
    if 'day' in age:
        days = int(age.split(' ')[0])
    if 'week' in age:
        days= int(age.split(' ')[0])*7
    if 'month' in age:
        days = int(age.split(' ')[0])*30
    if 'year' in age:
        days = int(age.split(' ')[0])*365    
    else:
        days = 0
    return days   

In [5]:
# Load data
dict_cols, full_data = load_data()

Loading train data finished in 0.091s
Loading test data finished in 0.123s
train data size: 26729
test data size: 11456


In [11]:
full_data['AgeuponOutcome'] = full_data['AgeuponOutcome'].apply(age2days)
full_data['Breed'] = full_data['Breed'].apply(lambda x: '-'.join(sorted(list(set(x.split("/"))))))
full_data['NameLength'] = full_data['Name'].apply(lambda x: len(str(x)) if x else 0)

In [12]:
dict_cols['categorical_cols'].remove('AgeuponOutcome')
dict_cols['numerical_cols'].append('AgeuponOutcome')
dict_cols['numerical_cols'].append('NameLength')

In [13]:
full_data['year'] = full_data['DateTime'].dt.year
full_data['month'] = full_data['DateTime'].dt.month
full_data['day'] = full_data['DateTime'].dt.day
full_data['weekday'] = full_data['DateTime'].dt.dayofweek
full_data['weekyear'] = full_data['DateTime'].dt.weekofyear
full_data['hour'] = full_data['DateTime'].dt.hour

In [14]:
dict_cols['numerical_cols'].append('year')
dict_cols['numerical_cols'].append('month')
dict_cols['numerical_cols'].append('day')
dict_cols['numerical_cols'].append('weekday')
dict_cols['numerical_cols'].append('weekyear')
dict_cols['numerical_cols'].append('hour') 

In [15]:
full_data["DateTime"] = full_data["DateTime"].apply(lambda x:str(x))

In [25]:
label_encoding(dict_cols['categorical_cols'] + dict_cols['label_col'], full_data)

Label encoding column: Name
Label encoding column: AnimalType
Label encoding column: SexuponOutcome
Label encoding column: Breed
Label encoding column: Color
Label encoding column: DateTime
Label encoding column: OutcomeType


{nan: 0,
 'Adoption': 1,
 'Died': 2,
 'Euthanasia': 3,
 'Return_to_owner': 4,
 'Transfer': 5}

In [26]:
full_data.dtypes

AgeuponOutcome    int64
AnimalType        int64
Breed             int64
Color             int64
DateTime          int64
Name              int64
OutcomeType       int64
SexuponOutcome    int64
NameLength        int64
year              int64
month             int64
day               int64
weekday           int64
weekyear          int64
hour              int64
dtype: object

In [27]:
train_X = full_data[:dict_cols['train_size']][dict_cols['categorical_cols'] + dict_cols['numerical_cols']].fillna(-999).values
train_y = full_data[:dict_cols['train_size']][dict_cols['label_col']].fillna(-999).values.reshape(dict_cols['train_size'])

### XGBoost tuning
###### Here we are using a wrapped functions instead of the XGBoost classifier

In [22]:
def xgb_train(trainX,trainY,params):
#XGBoost wrapper - to enable early stopping and missing value
    plst = list(params.items())
    offset = int(trainX.shape[0]*0.08)
    num_rounds = 10000
    xgtrain = xgb.DMatrix(trainX[offset:,:], label=trainY[offset:], missing=-999)
    xgval = xgb.DMatrix(trainX[:offset,:], label=trainY[:offset], missing=-999)

    #train using early stopping and predict
    watchlist = [(xgtrain, 'train'),(xgval, 'val')]
    model = xgb.train(params=plst, dtrain=xgtrain, num_boost_round=num_rounds, evals=watchlist,  early_stopping_rounds=100)
    print ("Best score:", model.best_score)
    print ("Best iteration:", model.best_iteration)
    return model

def xgb_pred(model,testX,params):
#XGBoost wrapper 
    xgtest = xgb.DMatrix(testX, missing=-999)
    preds = model.predict(xgtest,ntree_limit=model.best_iteration)
    return preds

#### Step 1 - choose a relatively larger learning rate(0.02) then use early stopping for optimal rounds
###### Initial parameters (works well for most cases)

  learning rate: 0.02
  
  max_depth: 6
  
  
  min_child_weight: 1
  
  
  colsample_bytree: 0.7
  
  
  subsample : 0.7

In [28]:
params = {"objective": "multi:softprob"
          , "num_class": 6
          , "booster":  "gbtree"
          , "eval_metric":  "mlogloss"
          , "eta": 0.02
          , "subsample": 0.7
          , "colsample_bytree": 0.7
          , "max_depth": 6
          , "min_child_weight": 1
          , "seed" : 1234
         }

model = xgb_train(train_X, train_y, params)

Will train until val error hasn't decreased in 100 rounds.
[0]	train-mlogloss:1.767655	val-mlogloss:1.768356
[1]	train-mlogloss:1.744993	val-mlogloss:1.746305
[2]	train-mlogloss:1.724192	val-mlogloss:1.726164
[3]	train-mlogloss:1.702380	val-mlogloss:1.704886
[4]	train-mlogloss:1.681083	val-mlogloss:1.684289
[5]	train-mlogloss:1.658807	val-mlogloss:1.662463
[6]	train-mlogloss:1.637838	val-mlogloss:1.641897
[7]	train-mlogloss:1.620651	val-mlogloss:1.625250
[8]	train-mlogloss:1.602389	val-mlogloss:1.607617
[9]	train-mlogloss:1.584257	val-mlogloss:1.590029
[10]	train-mlogloss:1.565539	val-mlogloss:1.571700
[11]	train-mlogloss:1.548284	val-mlogloss:1.554888
[12]	train-mlogloss:1.532859	val-mlogloss:1.540049
[13]	train-mlogloss:1.517433	val-mlogloss:1.525140
[14]	train-mlogloss:1.501511	val-mlogloss:1.509682
[15]	train-mlogloss:1.487196	val-mlogloss:1.495976
[16]	train-mlogloss:1.473337	val-mlogloss:1.482625
[17]	train-mlogloss:1.458890	val-mlogloss:1.468683
[18]	train-mlogloss:1.444728	val-

('Best score:', 0.782027)
('Best iteration:', 1021)


[1121]	train-mlogloss:0.512219	val-mlogloss:0.782342
Stopping. Best iteration:
[1021]	train-mlogloss:0.528730	val-mlogloss:0.782027



Cross validate for base line performance

In [31]:
param_grid = {"objective": ["multi:softprob"]
              , "learning_rate": [0.02]
              , "max_depth": [6]
              , "min_child_weight": [1]
              , "n_estimators": [1021]   #<best iteration (rounds) from last step 1, e.g. [1000]>
              , "subsample": [0.7]
              , "colsample_bytree": [0.7]
              , "nthread": [-1]
              , "silent" : [True]
              , "seed": [1234]}

model = search_model(train_X
            , train_y
            , xgb.XGBClassifier()
            , param_grid
            , 1
            , 4
            , False)



Fitting 4 folds for each of 1 candidates, totalling 4 fits
[CV] colsample_bytree=0.7, silent=True, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6 
[CV]  colsample_bytree=0.7, silent=True, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6, score=-0.774030 - 1.2min
[CV] colsample_bytree=0.7, silent=True, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6 


[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:  1.2min


[CV]  colsample_bytree=0.7, silent=True, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6, score=-0.753001 - 1.2min
[CV] colsample_bytree=0.7, silent=True, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6 
[CV]  colsample_bytree=0.7, silent=True, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6, score=-0.761113 - 1.2min
[CV] colsample_bytree=0.7, silent=True, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6 
[CV]  colsample_bytree=0.7, silent=True, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6, score=-0.756375 - 1.2min
Best score: -0.761
('Best parameters set:'

[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:  4.9min
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  4.9min finished


#### Step2 - tune max_depth and min_child_weight

###### Use learning rate 0.02 and the optimal rounds we got from step two, the do grid search for max_depth and min_child_weight
###### You can also plot the correlation between 

In [32]:
param_grid = {"objective": ["multi:softprob"]
              , "learning_rate": [0.02]
              , "max_depth": [5,6,7,8,9]
              , "min_child_weight": [1,3,5,7,9]
              , "n_estimators": [1021]   #<best iteration (rounds) from last step 1, e.g. [1000]>
              , "subsample": [0.7]
              , "colsample_bytree": [0.7]
              , "nthread": [-1]
              , "silent" : [False]
              , "seed": [1234]}

model = search_model(train_X
            , train_y
            , xgb.XGBClassifier()
            , param_grid
            , 1
            , 4
            , True)

print ("best max_depth:", model.best_params_['max_depth'])
print ("best min_child_weight:", model.best_params_['min_child_weight'])


Fitting 4 folds for each of 25 candidates, totalling 100 fits
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5, score=-0.780728 - 1.1min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5 


[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:  1.1min


[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5, score=-0.758178 - 1.1min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5, score=-0.766426 - 1.1min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5, score=-0.761541 - 1.0min
[CV] colsample_bytree=0.7, silent=Fal

[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:  4.3min


[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=3, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5, score=-0.781054 - 1.0min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=3, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=3, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5, score=-0.759038 - 1.0min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=3, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=3, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5, score=-0.767355 - 1.0min
[CV] colsample_bytree=0.7, silent=Fal

[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed:  7.3min


[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=3, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5, score=-0.761783 -  58.4s
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=5, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=5, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5, score=-0.781695 - 1.0min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=5, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=5, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5, score=-0.760726 - 1.0min
[CV] colsample_bytree=0.7, silent=Fal

[Parallel(n_jobs=1)]: Done  12 tasks       | elapsed: 12.4min


[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=7, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5, score=-0.782060 - 1.0min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=7, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=7, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5, score=-0.761501 - 1.0min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=7, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=7, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5, score=-0.768855 - 1.0min
[CV] colsample_bytree=0.7, silent=Fal

[Parallel(n_jobs=1)]: Done  17 tasks       | elapsed: 17.6min


[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=9, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5, score=-0.762788 -  58.6s
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=9, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=9, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5, score=-0.770169 -  56.1s
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=9, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=9, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=5, score=-0.763757 - 1.0min
[CV] colsample_bytree=0.7, silent=Fal

[Parallel(n_jobs=1)]: Done  24 tasks       | elapsed: 25.3min


[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=3, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6, score=-0.774778 - 1.2min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=3, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=3, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6, score=-0.753202 - 1.2min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=3, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=3, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6, score=-0.762730 - 1.2min
[CV] colsample_bytree=0.7, silent=Fal

[Parallel(n_jobs=1)]: Done  31 tasks       | elapsed: 33.9min


[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=5, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6, score=-0.756376 - 1.2min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=7, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=7, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6, score=-0.776935 - 1.2min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=7, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=7, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=6, score=-0.756810 - 1.2min
[CV] colsample_bytree=0.7, silent=Fal

[Parallel(n_jobs=1)]: Done  40 tasks       | elapsed: 44.5min


[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=7, score=-0.771815 - 1.4min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=7 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=7, score=-0.750326 - 1.4min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=7 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=7, score=-0.759654 - 1.4min
[CV] colsample_bytree=0.7, silent=Fal

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed: 57.5min


[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=5, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=7, score=-0.753173 - 1.5min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=5, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=7 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=5, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=7, score=-0.761472 - 1.5min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=5, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=7 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=5, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=7, score=-0.752290 - 1.4min
[CV] colsample_bytree=0.7, silent=Fal

[Parallel(n_jobs=1)]: Done  60 tasks       | elapsed: 73.7min


[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=8, score=-0.773080 - 1.7min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=8 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=8, score=-0.749978 - 1.8min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=8 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=1, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=8, score=-0.760817 - 1.7min
[CV] colsample_bytree=0.7, silent=Fal

[Parallel(n_jobs=1)]: Done  71 tasks       | elapsed: 92.4min


[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=5, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=8, score=-0.750317 - 1.7min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=7, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=8 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=7, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=8, score=-0.773864 - 1.6min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=7, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=8 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=7, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=8, score=-0.753483 - 1.7min
[CV] colsample_bytree=0.7, silent=Fal

[Parallel(n_jobs=1)]: Done  84 tasks       | elapsed: 115.0min


[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=3, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=9, score=-0.772504 - 1.9min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=3, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=9 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=3, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=9, score=-0.751422 - 1.9min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=3, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=9 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=3, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=9, score=-0.764233 - 1.9min
[CV] colsample_bytree=0.7, silent=Fal

[Parallel(n_jobs=1)]: Done  97 tasks       | elapsed: 138.7min


[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=9, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=9, score=-0.755938 - 1.6min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=9, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=9 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=9, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=9, score=-0.764918 - 1.6min
[CV] colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=9, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=9 
[CV]  colsample_bytree=0.7, silent=False, learning_rate=0.02, nthread=-1, min_child_weight=9, n_estimators=1021, subsample=0.7, seed=1234, objective=multi:softprob, max_depth=9, score=-0.754472 - 1.7min


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 143.7min finished


Best score: -0.758
('Best parameters set:', {'colsample_bytree': 0.7, 'silent': False, 'learning_rate': 0.02, 'nthread': -1, 'min_child_weight': 3, 'n_estimators': 1021, 'subsample': 0.7, 'seed': 1234, 'objective': 'multi:softprob', 'max_depth': 8})
('Scores:', [mean: -0.76672, std: 0.00860, params: {'colsample_bytree': 0.7, 'silent': False, 'learning_rate': 0.02, 'nthread': -1, 'min_child_weight': 1, 'n_estimators': 1021, 'subsample': 0.7, 'seed': 1234, 'objective': 'multi:softprob', 'max_depth': 5}, mean: -0.76731, std: 0.00848, params: {'colsample_bytree': 0.7, 'silent': False, 'learning_rate': 0.02, 'nthread': -1, 'min_child_weight': 3, 'n_estimators': 1021, 'subsample': 0.7, 'seed': 1234, 'objective': 'multi:softprob', 'max_depth': 5}, mean: -0.76821, std: 0.00827, params: {'colsample_bytree': 0.7, 'silent': False, 'learning_rate': 0.02, 'nthread': -1, 'min_child_weight': 5, 'n_estimators': 1021, 'subsample': 0.7, 'seed': 1234, 'objective': 'multi:softprob', 'max_depth': 5}, mean:

#### Step3 - tune colsample_bytre and sub_sample

Use learning rate and the optimal rounds we got from step 1, max_depth and min_child_weight from step 2, the grid search for colsample_bytree and subsample


In [None]:
param_grid = {"objective": ["multi:softprob"]
              , "learning_rate": [0.02]
              , "max_depth": [8] #<fill in with the best max_depth from step 2, e.g. [7]>
              , "min_child_weight": [5] #<fill in with the best min_child_weight from last step, e.g. [1]>
              , "n_estimators": [1021] #<fill in with best iteration (rounds) from last step, e.g. [1000]>
              , "subsample": [0.5,0.6,0.7,0.8,0.9]
              , "colsample_bytree": [0.5,0.6,0.7,0.8,0.9]
              , "nthread": [-1]
              , "silent" : [False]
              , "seed": [1234]}

model = search_model(full_data[:data_params['train_size']][full_cols].values
            , full_data[:data_params['train_size']][data_params['tgt_col']].values.reshape(data_params['train_size'])
            , xgb.XGBClassifier()
            , param_grid
            , 1
            , 4
            , True)

print ("best subsample:", model.best_params_['subsample'])
print ("best colsample_bytree:", model.best_params_['colsample_bytree'])

#### Step 4 - decrease learning rate for better performace

Use learning rate 0.01 tuned parameters max_depth, min_childe_weight, subsample and colsample_bytreee from step 2 and 3, repeat early stopping for better rounds



In [None]:
params = {     "objective": "multi:softprob"
          , "num_class": 6
          , "booster":  "gbtree"
          , "eval_metric":  "mlogloss"
          , "eta": 0.01
          , "subsample": <fill in with the best subsample from step 3, e.g. 0.7>
          , "colsample_bytree": <fill in with the best colsample_bytree from step 3, e.g. 0.7>
          , "max_depth": <fill in with the best max_depth from step 2, e.g. 7>
          , "min_child_weight": <fill in with the best min_child_weight from step 2, e.g.1>
          , "seed" : 1234
         }

model = xgb_train(full_data[:data_params['train_size']][full_cols].fillna(-999).values
                    ,full_data[:data_params['train_size']][data_params['tgt_col']].fillna(-999).values.reshape(data_params['train_size'])
                    ,params)

You've got the tuned parameters for learning rate, rounds, max_depth, min_child_weight, colsample_bytree and subsple. Now cross validate your model to see the improvements

In [None]:
param_grid = {"objective": ["multi:softprob"]
              , "learning_rate": [0.001]
              , "max_depth": [] #<fill in with the best max_depth from step 3, e.g. 0.7>
              , "min_child_weight": [] #<fill in with the best min_child_weight from step 3, e.g. 0.7>
              , "n_estimators": []   #<best iteration (rounds) from last step 1, e.g. [1000]>
              , "subsample": [] #<fill in with the best subsample from step 3, e.g. 0.7>
              , "colsample_bytree": [0.75] #<fill in with the best colsample_bytree from step 3, e.g. 0.7>
              , "nthread": [-1]
              , "silent" : [False]
              , "seed": [1234]}

model = search_model(full_data[:data_params['train_size']][full_cols].values
            , full_data[:data_params['train_size']][data_params['tgt_col']].values.reshape(data_params['train_size'])
            , xgb.XGBClassifier()
            , param_grid
            , 1
            , 4
            , True)

print ("best max_depth:", model.best_params_['max_depth'])
print ("best min_child_weight:", model.best_params_['min_child_weight'])



###### Homework

1. Early stopping is helpful to optimize the rounds needed. However it's somehow arbitary and can be improved by ensemble method. Hint: reverse the traing set and target for early stopping.
2. Try to tune parameters that were not covered by this notebook, e.g. gamma, alpha, and check the impacts.
3. There's a chance the parameters we tuned are not the best due to the step size, i.e. we may want to fine tune colsample_bytree and subsample by using smaller steps such as [0.5,0.52,0.54,0.56,0.58,0.60,0.62....0.9]