In [3]:
#I want to get to know gradient boosting methods (in particular, the xgboost library) and i am also currently in barbados.
#Import libraries:
import numpy as np
import pandas as pd
import xgboost as xgb
import time
#load data:
train = pd.read_csv("train.csv")
target = train['target']
#drop targets & (unique row) IDs from training data
train = train.drop(['ID','target'],axis=1)
test = pd.read_csv("test.csv")
IDs = test['ID'].values
test = test.drop(['ID'],axis=1)

# PREPROCESSING

In [4]:
#impute both numerical & categorical features a la
#http://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn

from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)
    
xtrain = DataFrameImputer().fit_transform( train )
xtest = DataFrameImputer().fit_transform( test )

In [6]:
#factorize categorical columns:
for column in xtrain:
    if xtrain[column].dtype == 'O':
#         print pd.factorize(xtrain[column])
        xtrain[column] = pd.factorize(xtrain[column])[0]
    
for column in xtest:
    if xtest[column].dtype == 'O':
#         print pd.factorize(xtrain[column])
        xtest[column] = pd.factorize(xtest[column])[0]

Next up: scaling/transforms/get_dummies/dimensionality reduction

# GRADIENT BOOSTING & CROSS VALIDATION

In [8]:
#check this out: http://xgboost.readthedocs.org/en/latest/model.html
from sklearn.cross_validation import KFold, train_test_split
X = xtrain.values
y = target.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1 , random_state=0)



In [9]:
# Early-stopping
#http://xgboost.readthedocs.org/en/latest/python/python_intro.html#early-stopping
#Also see https://github.com/dmlc/xgboost/blob/master/demo/guide-python/sklearn_examples.py (Jamie Hall et al.)
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train, early_stopping_rounds=50, eval_metric="logloss",
        eval_set=[(X_test, y_test)])

Will train until validation_0 error hasn't decreased in 50 rounds.
[0]	validation_0-logloss:0.660512
[1]	validation_0-logloss:0.633833
[2]	validation_0-logloss:0.611452
[3]	validation_0-logloss:0.592876
[4]	validation_0-logloss:0.577408
[5]	validation_0-logloss:0.564382
[6]	validation_0-logloss:0.553405
[7]	validation_0-logloss:0.543860
[8]	validation_0-logloss:0.536023
[9]	validation_0-logloss:0.528992
[10]	validation_0-logloss:0.523194
[11]	validation_0-logloss:0.518142
[12]	validation_0-logloss:0.514029
[13]	validation_0-logloss:0.510511
[14]	validation_0-logloss:0.507166
[15]	validation_0-logloss:0.504451
[16]	validation_0-logloss:0.502286
[17]	validation_0-logloss:0.500022
[18]	validation_0-logloss:0.498165
[19]	validation_0-logloss:0.496556
[20]	validation_0-logloss:0.495312
[21]	validation_0-logloss:0.493927
[22]	validation_0-logloss:0.492970
[23]	validation_0-logloss:0.491965
[24]	validation_0-logloss:0.491137
[25]	validation_0-logloss:0.490338
[26]	validation_0-logloss:0.48966

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [14]:
preds = clf.predict_proba(xtest.values, ntree_limit=clf.best_iteration)[:,1]

In [16]:
import csv
predictions_file = open("xgboost_predictions.csv", "w")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["ID", "PredictedProb"])
open_file_object.writerows(zip(IDs, preds))
predictions_file.close()

This above performed okay: logloss = -0.5252. But I think we need to increase num_rounds and at least try to change preprocessing:

# TESTING ANOTHER APPROACH

Loading & preprocessing:

In [53]:
#https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146/code
print('Load data...')
train = pd.read_csv("train.csv")
target = train['target']
train = train.drop(['ID','target'],axis=1)
test = pd.read_csv("test.csv")
ids = test['ID'].values
test = test.drop(['ID'],axis=1)
#
print('Clearing...')
for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(),test.iteritems()):
    if train_series.dtype == 'O':
        #for objects: factorize
        train[train_name], tmp_indexer = pd.factorize(train[train_name])
        test[test_name] = tmp_indexer.get_indexer(test[test_name])
        #but now we have -1 values (NaN)
    else:
        #for int or float: fill NaN
        tmp_len = len(train[train_series.isnull()])
        if tmp_len>0:
            train.loc[train_series.isnull(), train_name] = train_series.mean()
        #and Test
        tmp_len = len(test[test_series.isnull()])
        if tmp_len>0:
            test.loc[test_series.isnull(), test_name] = train_series.mean()  #TODO

Load data...
Clearing...


A little function to report best scores (from cross validation):

In [60]:
from operator import itemgetter
# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

Now we perform a randomizedsearchCV over a number of parameters (using xgb.XGBClassifier()) --
I do this because I don't know how to do it with xgb.train() --
Important question: what is the relation between these two xgb.train() & xgb.XGBClassifier()?
This is important because I can only do hyperparameter tuning on the latter AND I can only alter num_rounds on the former (which is necessary for a good model, it seems). Any thoughts?

In [65]:
# X = train.values
# y = target.values
#https://www.kaggle.com/c/springleaf-marketing-response/forums/t/16627/help-with-xgboost-sklearn-randomized-grid-search
# -*- coding: utf-8 -*-
"""
"""
t0 = time.time()
#http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html
#http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.RandomizedSearchCV.html
param_grid = {'max_depth': range(4,15),
#                         'min_child_weight': [1,40],
                      'objective':['binary:logistic'],
#                       'n_estimators':[5],
                      'learning_rate':[0.01], #this is same as eta
                      'subsample': np.arange(0.1,1.1,0.1),
                      'colsample_bytree': np.arange(0.1,1.1,0.1),
                      #'scale_pos_weight': [0.5, 1]
                      #'model__eta':[0.01,0.02],
                     #'model__scale_pos_weight':[0.8,1.0]
                      #'model__silent':[1],
                      }


from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn import metrics

xgb_model = xgb.XGBClassifier()
n_iter_search=20
random_search = RandomizedSearchCV(xgb_model, param_distributions=param_grid,
                                   n_iter=n_iter_search, scoring ="log_loss")

# start = time()
# training and y_training are 
# small dataset and target variable that I generated from the training dataset
random_search.fit(train, target) 
t1 = time.time()
total_time = t1 - t0
print total_time

print report(random_search.grid_scores_)
xgb_model_best = xgb.XGBClassifier()
xgb_model_best.set_params(**random_search.best_params_)
#http://stackoverflow.com/questions/34674797/xgboost-xgbclassifier-defaults-in-python
xgb_model_best.fit(X , y)
preds = xgb_model_best.predict_proba(xtest.values)[:,1]
#also see this! https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings/forums/t/18494/gridsearchcv-on-xgboost/105272

5031.24653196
Model with rank: 1
Mean validation score: -0.515 (std: 0.000)
Parameters: {'objective': 'binary:logistic', 'subsample': 0.80000000000000004, 'learning_rate': 0.01, 'colsample_bytree': 0.90000000000000002, 'max_depth': 11}

Model with rank: 2
Mean validation score: -0.516 (std: 0.000)
Parameters: {'objective': 'binary:logistic', 'subsample': 0.40000000000000002, 'learning_rate': 0.01, 'colsample_bytree': 1.0, 'max_depth': 12}

Model with rank: 3
Mean validation score: -0.516 (std: 0.000)
Parameters: {'objective': 'binary:logistic', 'subsample': 0.5, 'learning_rate': 0.01, 'colsample_bytree': 0.90000000000000002, 'max_depth': 13}

None


In [66]:
import csv
predictions_file = open("xgb_rgs_larger_predictions.csv", "w")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["ID", "PredictedProb"])
open_file_object.writerows(zip(IDs, preds))
predictions_file.close()

This above performed ok (logloss = -.53791) but not as well as other people's xgbtrain() w/ a large num_rounds. For example, see:
https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146/code

In [67]:
#https://www.kaggle.com/mpearmain/homesite-quote-conversion/xgboost-benchmark
#https://www.kaggle.com/c/springleaf-marketing-response/forums/t/17089/beating-the-benchmark/96855
#https://github.com/lenguyenthedat/kaggle-for-fun/blob/master/springleaf-marketing-response/springleaf-xgb-native.py

So now I'll try using the best parameters for xgb.XGBClassifier() in xgb.train() AND make num_boost_round = 200.

In [None]:
#cf https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146/code
t0 = time.time()
xgtrain = xgb.DMatrix(train.values, target.values)
xgtest = xgb.DMatrix(test.values)

#Now let's fit the model
print('Fit the model...')
boost_round = 2000 #1800 CHANGE THIS BEFORE START
clf = xgb.train(random_search.best_params_,xgtrain,num_boost_round=boost_round,verbose_eval=True,maximize=False)

#Make predict
print('Predict...')
preds = clf.predict(xgtest, ntree_limit=clf.best_iteration )
##check here for eval metrics + https://github.com/dmlc/xgboost/blob/master/demo/guide-python/evals_result.py
t1 = time.time()
total_time = t1 - t0
print total_time

In [None]:
import csv
predictions_file = open("xgb_rgs_more_rounds_predictions.csv", "w")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["ID", "PredictedProb"])
open_file_object.writerows(zip(IDs, preds))
predictions_file.close()

This performed well: logloss = -0.45991 . 