# XGBoost Parameter Tuning
###### using Zillow Home Value from Kaggle as an Example

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read in the raw data. Download it first from https://www.kaggle.com/c/zillow-prize-1/data.
# PATH defaults to the original author's local folder; set the ZILLOW_DATA_DIR
# environment variable to point at your own copy instead of editing this cell.
import os

PATH = os.environ.get(
    'ZILLOW_DATA_DIR',
    '/Users/johnnychiu/Desktop/MyFiles/learning/kaggle/9.Zillow-Home-Value-Prediction')
properties = pd.read_csv(PATH + '/input/properties_2016.csv')
train = pd.read_csv(PATH + '/input/train_2016_v2.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Show the dtype pandas inferred for each column of the properties table.
properties.dtypes

parcelid                          int64
airconditioningtypeid           float64
architecturalstyletypeid        float64
basementsqft                    float64
bathroomcnt                     float64
bedroomcnt                      float64
buildingclasstypeid             float64
buildingqualitytypeid           float64
calculatedbathnbr               float64
decktypeid                      float64
finishedfloor1squarefeet        float64
calculatedfinishedsquarefeet    float64
finishedsquarefeet12            float64
finishedsquarefeet13            float64
finishedsquarefeet15            float64
finishedsquarefeet50            float64
finishedsquarefeet6             float64
fips                            float64
fireplacecnt                    float64
fullbathcnt                     float64
garagecarcnt                    float64
garagetotalsqft                 float64
hashottuborspa                   object
heatingorsystemtypeid           float64
latitude                        float64


### Data Transformation
Before we start, we need to transform the dataframe a bit so that it can be fed to XGBoost. We do this with the code from this kernel: https://www.kaggle.com/aharless/xgb-w-o-outliers-lgb-with-outliers-combined/code. Keep in mind that this part isn't the main focus of this notebook.

In [4]:
# Prepare the raw tables for XGBoost.

print("\nProcessing data for XGBoost ...")
for col_name in properties.columns:
    # Every missing value is replaced by the sentinel -1.
    properties[col_name] = properties[col_name].fillna(-1)
    if properties[col_name].dtype == 'object':
        # Object columns are replaced by integer codes from a LabelEncoder.
        raw_values = list(properties[col_name].values)
        properties[col_name] = LabelEncoder().fit(raw_values).transform(raw_values)

shape_report = 'Shape train: {}\nShape test: {}'

# Attach the property features to every transaction in the training table.
train_df = train.merge(properties, how='left', on='parcelid')
x_train = train_df.drop(['parcelid', 'logerror', 'transactiondate'], axis=1)
x_test = properties.drop('parcelid', axis=1)
print(shape_report.format(x_train.shape, x_test.shape))

# Remove outliers: keep only rows with -0.4 < logerror < 0.418.
within_bounds = (train_df.logerror > -0.4) & (train_df.logerror < 0.418)
train_df = train_df[within_bounds]

# Rebuild the feature matrix and target from the filtered rows.
x_train = train_df.drop(['parcelid', 'logerror', 'transactiondate'], axis=1)
y_train = train_df["logerror"].values.astype(np.float32)
y_mean = np.mean(y_train)

print('After removing outliers:')
print(shape_report.format(x_train.shape, x_test.shape))


Processing data for XGBoost ...
Shape train: (90275, 57)
Shape test: (2985217, 57)
After removing outliers:
Shape train: (88525, 57)
Shape test: (2985217, 57)


### Downsizing the dataframe
The original train size is huge. We take a random sample of it for the purpose of illustrating.

In [5]:
# drop out ouliers & downsize the train data by random sampling
# Downsize the (already outlier-filtered) training data by randomly sampling parcels.
import random

N_SAMPLE_PARCELS = 200

random.seed(33)
# random.sample needs a real sequence: newer Python versions reject a NumPy array,
# so the unique ids are converted to a list first.
unique_parcelids = list(train_df.parcelid.unique())
# Never ask for more parcels than exist (random.sample would raise ValueError).
sel_parcelid = random.sample(unique_parcelids, min(N_SAMPLE_PARCELS, len(unique_parcelids)))
train_df = train_df[train_df['parcelid'].isin(sel_parcelid)].reset_index(drop=True)

# Rebuild the feature matrix and target from the sampled rows.
x_train = train_df.drop(['parcelid', 'logerror', 'transactiondate'], axis=1)
y_train = train_df["logerror"].values.astype(np.float32)
y_mean = np.mean(y_train)

### Parameter Tuning
After we transformed our dataframe, we can now start to do the parameter tuning.

##### Step 1: Fix learning rate and number of estimators for tuning tree-based parameters
###### description
* Choose a relatively high learning rate. A learning rate of 0.1 generally works, but anything between 0.05 and 0.3 can work depending on the problem.
* Determine the optimum number of trees for this learning rate. XGBoost has a very useful function called `cv`, which performs cross-validation at each boosting iteration and thus returns the optimum number of trees required.

###### note
* The "optimum number of trees" for this learning rate is controlled by `n_estimators` in the scikit-learn wrapper (e.g. `XGBClassifier`) and by `num_boost_round` in the native `xgboost` API (`xgb.cv` / `xgb.train`).


In [8]:
from xgboost.sklearn import XGBRegressor

# The target (logerror) is continuous and the objective is a regression objective,
# so the regression wrapper is the matching estimator (not XGBClassifier).
xgb1 = XGBRegressor(
    learning_rate=0.2,      # relatively high rate while tuning the tree parameters
    n_estimators=100,       # upper bound; find_n_estimators lowers it via xgb.cv
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:linear',
    n_jobs=4,
    scale_pos_weight=1,
    random_state=27)

In [8]:
def find_n_estimators(alg, x_train, y_train, cv_folds=5, early_stopping_rounds=10, fitting_model=False):
    """Set ``alg``'s ``n_estimators`` from cross-validation and optionally fit it.

    Runs ``xgb.cv`` with the estimator's current booster parameters, using MAE
    as the metric and ``alg``'s current ``n_estimators`` as the maximum number
    of boosting rounds. The number of rows in the returned CV table is then
    stored back into ``alg`` as ``n_estimators``.

    Parameters
    ----------
    alg : estimator exposing ``get_xgb_params``, ``get_params``, ``set_params``,
        ``fit`` and ``predict`` (the xgboost scikit-learn wrapper).
    x_train, y_train : training features and target, passed to ``xgb.DMatrix``
        and to ``alg.fit``.
    cv_folds : int, number of CV folds (``nfold`` of ``xgb.cv``).
    early_stopping_rounds : int, forwarded to ``xgb.cv``.
    fitting_model : bool, when true the estimator is also fitted on the
        training data and its training MAE is printed.

    Returns
    -------
    The same ``alg`` object, modified in place.
    """
    xgb_param = alg.get_xgb_params()
    xgtrain = xgb.DMatrix(x_train, label=y_train)
    cvresult = xgb.cv(xgb_param, xgtrain,
                      num_boost_round=alg.get_params()['n_estimators'],
                      nfold=cv_folds,
                      metrics='mae',
                      early_stopping_rounds=early_stopping_rounds)
    print(cvresult)
    alg.set_params(n_estimators=cvresult.shape[0])

    if fitting_model:
        # Imported here so the function is self-contained; sklearn is only
        # needed for the optional training report.
        from sklearn.metrics import mean_absolute_error

        # Fit the algorithm on the data
        alg.fit(x_train, y_train, eval_metric='mae')

        # This is a regression problem: use the predicted values directly
        # (predict_proba only exists for classifiers).
        train_predictions = alg.predict(x_train)

        # Print model report; the label says MAE, so compute MAE (not MSE).
        print("\nModel Report")
        print("MAE (Train): %f" % mean_absolute_error(y_train, train_predictions))

    return alg

In [None]:
xgb1=find_n_estimators(xgb1, x_train, y_train, fitting_model=True)

    test-mae-mean  test-mae-std  train-mae-mean  train-mae-std
0        0.393911      0.000265        0.393915       0.000048
1        0.315719      0.000293        0.315702       0.000028
2        0.254037      0.000287        0.253986       0.000054
3        0.205583      0.000276        0.205486       0.000073
4        0.167691      0.000234        0.167546       0.000065
5        0.138250      0.000232        0.138057       0.000065
6        0.115566      0.000254        0.115335       0.000075
7        0.098282      0.000278        0.098006       0.000093
8        0.085276      0.000300        0.084966       0.000082
9        0.075682      0.000261        0.075331       0.000091
10       0.068708      0.000252        0.068324       0.000066
11       0.063783      0.000231        0.063363       0.000059
12       0.060324      0.000195        0.059879       0.000065
13       0.057940      0.000183        0.057477       0.000047
14       0.056315      0.000179        0.055811       0

##### Step 2: Tune max_depth and min_child_weight
###### description
* We tune these first because they have the highest impact on the model outcome. We start with a wide, coarse grid and then run a second iteration over a narrower range.

###### note
* GridSearchCV documentation -> http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
* scoring parameters -> http://scikit-learn.org/stable/modules/model_evaluation.html


In [None]:
from sklearn.model_selection import GridSearchCV

# Coarse grid: max_depth in steps of two from 3 to 9, min_child_weight from 1 to 5.
param_test1 = dict(
    max_depth=range(3, 10, 2),
    min_child_weight=range(1, 6, 2),
)

# 5-fold grid search over the coarse grid, scored by negative MSE.
gsearch1 = GridSearchCV(
    estimator=xgb1,
    param_grid=param_test1,
    scoring='neg_mean_squared_error',
    iid=False,
    cv=5,
)
gsearch1.fit(x_train, y_train)

# Display the full CV table, the winning combination and its score.
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Coarse grid for the first tuning pass (step of two for both parameters).
param_test1 = dict(
    max_depth=range(3, 10, 2),
    min_child_weight=range(1, 6, 2),
)

In [None]:
def gridsearch_parameter_tuning(xgb, param_test):
    """Grid-search ``param_test`` for the estimator ``xgb`` and apply the best values.

    Runs a 5-fold ``GridSearchCV`` (scored by negative MSE) on the notebook's
    global ``x_train`` / ``y_train``, prints the CV results, the best
    parameters and the best score, then writes the best parameters back into
    the estimator.

    Parameters
    ----------
    xgb : estimator to tune. Note that inside this function the name shadows
        the ``xgb`` module alias; the module is not used here.
    param_test : parameter grid passed to ``GridSearchCV`` as ``param_grid``.

    Returns
    -------
    The same estimator object, updated in place with the best parameters.
    """
    # Search with the estimator and grid that were passed in, not the
    # notebook globals xgb1 / param_test1.
    gsearch = GridSearchCV(estimator=xgb,
                           param_grid=param_test,
                           scoring='neg_mean_squared_error',
                           iid=False,
                           cv=5)
    gsearch.fit(x_train, y_train)
    print('cv result:{}'.format(gsearch.cv_results_))
    print('best parameters:{}'.format(gsearch.best_params_))
    print('best score:{}'.format(gsearch.best_score_))

    # Unpack the dict so each tuned parameter is set under its own name
    # (set_params(key=...) would set a parameter literally called "key").
    xgb.set_params(**gsearch.best_params_)

    return xgb

In [None]:
# The tuning helper takes the grid as its second argument; use the coarse grid first.
xgb2 = gridsearch_parameter_tuning(xgb1, param_test1)

Let's go one step deeper and look for the optimum values. Because the first grid used a step of two, we now also search the values one above and one below the optimum found there.

In [None]:
# Finer grid for the second pass over max_depth and min_child_weight.
param_test2 = dict(
    max_depth=[3, 4, 5, 6],
    min_child_weight=[1, 2, 3],
)

In [None]:
# Second pass: continue from the model tuned on the coarse grid and search the finer grid.
xgb2 = gridsearch_parameter_tuning(xgb2, param_test2)

##### Step 3: Tune gamma
###### description
* Now let's tune gamma using the parameters already tuned above. Gamma can take many values, but we check only 5 here; you can search a finer grid if needed.
###### note
* gamma [default=0]: a node is split only when the split gives a positive reduction in the loss function; gamma specifies the minimum loss reduction required to make a split.
* A higher gamma makes the algorithm more conservative. Good values vary with the loss function, so gamma should be tuned.


In [None]:
# Candidate gamma values: 0.0, 0.1, 0.2, 0.3, 0.4.
param_test3 = {
    'gamma': [i/10.0 for i in range(0,5)]
}

# Pass the grid explicitly; the helper requires it as its second argument.
xgb3 = gridsearch_parameter_tuning(xgb2, param_test3)

##### Step 4: Tune subsample and colsample_bytree
###### description
* The next step is to try different subsample and colsample_bytree values. Let's do this in two stages as well, starting with 0.6, 0.7, 0.8 and 0.9 for both.

In [None]:
# Candidate values 0.6, 0.7, 0.8, 0.9 for both sampling ratios.
param_test4 = {
    'subsample': [i/10.0 for i in range(6,10)],
    'colsample_bytree': [i/10.0 for i in range(6,10)]
}

# Pass the grid explicitly; the helper requires it as its second argument.
xgb4 = gridsearch_parameter_tuning(xgb3, param_test4)

Next we run a second pass over `colsample_bytree` alone, this time trying the values 0.3 to 0.6 in steps of 0.1.

In [None]:
# Second pass over colsample_bytree only: 0.3, 0.4, 0.5, 0.6.
param_test5 = {
    'colsample_bytree': [i/10.0 for i in range(3,7)]
}
# Pass the grid explicitly; the helper requires it as its second argument.
xgb5 = gridsearch_parameter_tuning(xgb4, param_test5)

##### Step 5: Tuning Regularization Parameters
###### description
* The next step is to apply regularization to reduce overfitting. Many people don't use these parameters much, because gamma already provides a substantial way of controlling complexity, but we should always try them.
* Here we tune `reg_alpha` and leave it up to you to try different values of `reg_lambda`.
* Tuning the regularization parameters (lambda, alpha) of xgboost can help reduce model complexity and enhance performance.

In [None]:
# Coarse search over reg_alpha across several orders of magnitude.
param_test6 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}
# Pass the grid explicitly; the helper requires it as its second argument.
xgb6 = gridsearch_parameter_tuning(xgb5, param_test6)

In [None]:
# Finer search over small reg_alpha values.
param_test7 = {
    'reg_alpha': [0, 0.005, 0.01, 0.03, 0.05]
}
# Pass the grid explicitly; the helper requires it as its second argument.
xgb7 = gridsearch_parameter_tuning(xgb6, param_test7)

##### Step 6: Reducing Learning Rate
###### description
* Lastly, we lower the learning rate and add more trees. Let's use the `cv` function of XGBoost again to find the number of trees for each lower learning rate, and then decide on the final parameters.

In [None]:
# Learning rate 0.1 with up to 100 trees; find_n_estimators (the CV helper defined
# earlier in this notebook) picks the tree count and reports the training MAE.
xgb7.set_params(n_estimators=100, learning_rate=0.1)
find_n_estimators(xgb7, x_train, y_train, fitting_model=True)

In [None]:
# Learning rate 0.05 with up to 200 trees; find_n_estimators (the CV helper defined
# earlier in this notebook) picks the tree count and reports the training MAE.
xgb7.set_params(n_estimators=200, learning_rate=0.05)
find_n_estimators(xgb7, x_train, y_train, fitting_model=True)

In [None]:
# Learning rate 0.03 with up to 300 trees; find_n_estimators (the CV helper defined
# earlier in this notebook) picks the tree count and reports the training MAE.
xgb7.set_params(n_estimators=300, learning_rate=0.03)
find_n_estimators(xgb7, x_train, y_train, fitting_model=True)

After parameter tuning, our score (MAE) decreases from ? to ? (TODO: fill in the values measured before and after tuning). These are the parameters that we have tried our best to tune.

### Build Model

In [None]:
print("\nSetting up data for XGBoost ...")
# Number of boosting rounds. This is an argument of xgb.train, not a booster
# parameter, so it is kept out of the params dict below.
NUM_BOOST_ROUND = 105

# xgboost booster params (values found by the tuning steps above)
new_xgb_params = {
    'eta': 0.05,
    'max_depth': 3,
    'min_child_weight': 2,
    'gamma': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'objective': 'reg:linear',
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 27,
    'alpha': 0.01,
    'eval_metric': 'mae',
    'silent': 0
}

In [None]:
# Wrap the training features (with their target) and the test features as DMatrix objects.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test)

In [None]:
# Train the booster on a copy of the tuned parameters.
print("\nTraining XGBoost ...")
booster_params = dict(new_xgb_params)
model = xgb.train(booster_params, dtrain, num_boost_round=105)

# Score every property in the test matrix.
print("\nPredicting with XGBoost ...")
xgb_pred = model.predict(dtest)

# Preview the first few predictions.
print("\nXGBoost predictions:")
prediction_preview = pd.DataFrame(xgb_pred).head()
print(prediction_preview)

### WRITE THE RESULTS

In [None]:
from datetime import datetime

print("\nPreparing results for write ...")
# Round all predictions at once instead of building a list of strings in a loop.
y_pred = np.round(xgb_pred, 4)

# The same prediction is written for every month column of the submission.
month_cols = ['201610', '201611', '201612', '201710', '201711', '201712']
output = pd.DataFrame({'ParcelId': properties['parcelid'].astype(np.int32)})
for month in month_cols:
    output[month] = y_pred

# Select the columns explicitly so 'ParcelId' is always first. Rotating the
# column list only works when the DataFrame happens to sort the dict keys.
output = output[['ParcelId'] + month_cols]

print("\nWriting results to disk ...")
output.to_csv(
    PATH + '/20170726_XGB-LGB-combined/_submission/sub{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S')),
    index=False,
    float_format='%.4f')  # write predictions with exactly four decimals

print("\nFinished ...")

### References
* https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
* https://www.kaggle.com/aharless/xgb-w-o-outliers-lgb-with-outliers-combined/code