# Gradient Boosting - Expedia Kaggle challenge

In [62]:
import csv
import random
import sys
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation   import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing      import LabelEncoder
from sklearn.preprocessing      import OneHotEncoder

# This gives us the "Mean average precision" function
import ml_metrics as metrics 

sys.path.append("..") # Adds higher directory to python modules path.
import ml_utilities

%matplotlib inline

In [49]:
def expand_date_columns(frame, date_columns):
    """Replace each date column with integer day / month / year columns.

    For every name in `date_columns` the values are parsed with
    `pd.DatetimeIndex` and three columns `<name>_day`, `<name>_month` and
    `<name>_year` are appended (in that order); the original column is dropped.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame containing the columns named in `date_columns`. Not modified.
    date_columns : iterable of str
        Names of the columns to expand.

    Returns
    -------
    pd.DataFrame
        A new frame: the remaining columns in their original order followed
        by the generated date-part columns.
    """
    # drop() hands back a fresh frame, so the caller's frame is left untouched.
    expanded = frame.drop(list(date_columns), axis=1)

    for col in date_columns:
        stamps = pd.DatetimeIndex(frame[col])
        for part in ("day", "month", "year"):
            expanded[col + "_" + part] = getattr(stamps, part)

    return expanded


train = pd.read_csv("./train.csv", nrows = 10000)

# The label is split off so that `train` only holds features from here on.
target = train['hotel_cluster']
train = train.drop(['hotel_cluster'], axis=1)
test = pd.read_csv("./test.csv", nrows = 1000)

feat_names = test.columns.values
dest = pd.read_csv("./destinations.csv")

# Every object-typed column of the training frame is treated as a date below.
obj_feat = train.select_dtypes(include=[object]).keys()

# We convert the date columns in to columns indicating day / month / year
train = expand_date_columns(train, obj_feat)
test = expand_date_columns(test, obj_feat)

print(obj_feat)
print(train.keys())
print("training set shape: {0}x{1}, test set shape: {2}x{3}".format(train.shape[0], train.shape[1],test.shape[0], test.shape[1]))

# Last expression of the cell, so the preview is actually rendered.
train.head()


Index(['date_time', 'srch_ci', 'srch_co'], dtype='object')
Index(['site_name', 'posa_continent', 'user_location_country',
       'user_location_region', 'user_location_city',
       'orig_destination_distance', 'user_id', 'is_mobile', 'is_package',
       'channel', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt',
       'srch_destination_id', 'srch_destination_type_id', 'is_booking', 'cnt',
       'hotel_continent', 'hotel_country', 'hotel_market', 'date_time_day',
       'date_time_month', 'date_time_year', 'srch_ci_day', 'srch_ci_month',
       'srch_ci_year', 'srch_co_day', 'srch_co_month', 'srch_co_year'],
      dtype='object')
training set shape: 10000x29, test set shape: 1000x28


In [64]:
# NOTE(review): randomised_imputer is defined in ../ml_utilities, which is not visible
# here; presumably it fills missing feature values with randomly drawn ones — confirm.
# `train` is rebound to the result, so re-running this cell feeds the already
# processed frame back into the imputer. `test` is not passed through it.
train = ml_utilities.randomised_imputer(train)

In [65]:
# Labels and feature matrix as plain numpy arrays, which is what do_train indexes into.
Y = target.values
X = train.values

# Booster settings plus the control values that do_train reads itself
# (n_estimators, eta / min_eta / eta_decay, max_fails, early_stopping_rounds).
#
# NOTE(review): do_train scores every fold with sklearn's log_loss, and the
# recorded run of this cell stopped with "ValueError: y_true and y_pred have
# different number of classes 100, 2". 'rank:pairwise' yields one score per row
# rather than per-class probabilities, so the objective probably has to become a
# multi-class one (e.g. 'multi:softprob' together with 'num_class') — confirm
# before changing, because do_train's base-margin hand-off would then receive
# one margin per class as well.
params = {
    "max_depth"             : 5, 
    "eta"                   : 0.1,
    "min_eta"               : 0.00001,
    "eta_decay"             : 0.5,
    "max_fails"             : 3,
    "early_stopping_rounds" : 20,
    "objective"             : 'rank:pairwise',
    "subsample"             : 0.8, 
    "colsample_bytree"      : 1.0,
    "n_jobs"                : -1,
    "n_estimators"          : 5000, 
    "silent"                : 1,
    "gamma"                 : 0.1,
    "min_child_weight"      : 1.1
    }

print("\nWithout decay ...\n")
best_model_nd = do_train(X, Y, params)

# The original cell read `te`, which is not defined anywhere in the notebook.
# Score the loaded test frame instead, re-indexed to the training columns so the
# booster sees the same features in the same order. Columns that exist only in
# `test` are dropped; columns that exist only in `train` become NaN, which
# DMatrix treats as missing by default.
test_features = test.reindex(columns=train.columns)
dte = xgb.DMatrix(test_features.values)
te_pred_nd = best_model_nd.predict(dte, ntree_limit=best_model_nd.best_iteration+1)



Without decay ...



Will train until valid error hasn't decreased in 20 rounds.
[0]	train-map:0.991753	valid-map:0.993766
[1]	train-map:0.991724	valid-map:0.994342
[2]	train-map:0.992096	valid-map:0.994519
[3]	train-map:0.992900	valid-map:0.994409
[4]	train-map:0.993030	valid-map:0.994252
[5]	train-map:0.993338	valid-map:0.994339
[6]	train-map:0.992913	valid-map:0.993285
[7]	train-map:0.993095	valid-map:0.993722
[8]	train-map:0.992947	valid-map:0.993613
[9]	train-map:0.993071	valid-map:0.993470
[10]	train-map:0.992879	valid-map:0.993766
[11]	train-map:0.993035	valid-map:0.993902
[12]	train-map:0.992873	valid-map:0.993878
[13]	train-map:0.993129	valid-map:0.993875
[14]	train-map:0.993171	valid-map:0.993864
[15]	train-map:0.993350	valid-map:0.994135
[16]	train-map:0.993337	valid-map:0.994395
[17]	train-map:0.993332	valid-map:0.994410
[18]	train-map:0.993098	valid-map:0.994068
[19]	train-map:0.993184	valid-map:0.994064
[20]	train-map:0.993320	valid-map:0.994060
[21]	train-map:0.993286	valid-map:0.994061
[22]

ValueError: y_true and y_pred have different number of classes 100, 2

In [None]:
# File name encodes the main hyper-parameters of the run that produced the predictions.
output_name = "pred_cv_{0}_{1}_{2}_{3}_{4}".format(params["eta"], params["n_estimators"], 
                                                params["max_depth"], params["gamma"], params["min_child_weight"])

# `IDs` was never defined in the notebook. NOTE(review): this assumes the test file
# carries an `id` column and falls back to the row index when it does not — confirm
# which identifier the submission needs.
IDs = test["id"] if "id" in test.columns else test.index

# Save results
# The `with` block closes the file even when a write fails; newline="" is what the
# csv module asks for so that it controls the line endings itself.
with open(output_name + ".csv", "w", newline="") as predictions_file:
    writer = csv.writer(predictions_file)
    writer.writerow(["ID", "PredictedProb"])
    writer.writerows(zip(IDs, te_pred_nd))

### The training routine - run this cell before the training cell above, which calls `do_train`

In [51]:
def eval_mapk(target, predictions, k=5):
    """Return the mean average precision at `k` for single-label predictions.

    Every target and every prediction is wrapped in a one-element list, the
    list-of-lists form passed to `metrics.mapk`.

    Parameters
    ----------
    target : iterable
        True label of each sample.
    predictions : iterable
        One predicted label per sample, in the same order as `target`.
    k : int, optional
        Cut-off forwarded to `metrics.mapk` (default 5).

    Returns
    -------
    The score computed by `metrics.mapk`.
    """
    actual = [[label] for label in target]
    predicted = [[guess] for guess in predictions]
    # The original computed this value but dropped it, so callers always got None.
    return metrics.mapk(actual, predicted, k=k)

def do_train(X, Y, params, verbose=False):
    ''' Trains a model with inputs X against predictor variable Y, with CV

    The data is split into 5 stratified, shuffled folds. Within each fold a
    booster is trained in stages: after every stage the learning rate is
    multiplied by params["eta_decay"], and when a stage improves the validation
    log loss its margin predictions become the base margin of the next stage.
    A fold ends when eta drops below params["min_eta"] or when
    params["max_fails"] stages in a row bring no improvement.

    Parameters
    ----------
    X : numpy array, indexed as X[rows, :]
        Feature matrix.
    Y : numpy array
        Labels, also used to stratify the folds.
    params : dict
        Booster parameters plus the control keys read here: "n_estimators",
        "eta", "min_eta", "eta_decay", "early_stopping_rounds", "max_fails".
        The dict itself is not modified.
    verbose : bool
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    The best-scoring stage model of the LAST fold only (earlier folds contribute
    to the printed scores but their models are discarded).

    Raises
    ------
    ValueError
        If params["eta"] is already below params["min_eta"], in which case no
        stage could ever be trained.

    NOTE(review): every stage after the first is trained on top of a base margin,
    so the returned booster presumably holds only the trees of its own stage and
    will not reproduce the staged predictions on new data by itself - confirm.
    '''
    np.random.seed(1)
    random.seed(1)

    # Keys that steer this routine; they are not forwarded to the booster.
    control_keys = ("n_estimators", "min_eta", "eta_decay",
                    "early_stopping_rounds", "max_fails")

    num_round   = params["n_estimators"]
    start_eta   = params["eta"]
    min_eta     = params["min_eta"]
    eta_decay   = params["eta_decay"]
    early_stop  = params["early_stopping_rounds"]
    max_fails   = params["max_fails"]

    if start_eta < min_eta:
        raise ValueError("params['eta'] must not be smaller than params['min_eta']")

    cv_scores = []
    train_scores = []
    best_model = None

    split = StratifiedKFold(Y, 5, shuffle=True )

    for fold, (train_index, cv_index) in enumerate(split, start=1):

        X_train, X_valid = X[train_index,:], X[cv_index,:]
        y_train, y_valid = Y[train_index],   Y[cv_index]

        # Fresh per-fold copy: eta is overwritten below and must not leak into
        # the caller's dict or into the next fold.
        booster_params = dict((key, value) for key, value in params.items()
                              if key not in control_keys)
        eta = start_eta

        dtrain = xgb.DMatrix( X_train, label=y_train )
        dvalid = xgb.DMatrix( X_valid, label=y_valid )

        best_train_score = None
        best_cv_score    = None
        best_model       = None
        fail_count       = 0

        while eta >= min_eta:

            # A plain dict is passed (the original handed over a dict.items() view).
            model = xgb.train( booster_params,
                               dtrain,
                               num_round,
                               [(dtrain, 'train'), (dvalid, 'valid')],
                               early_stopping_rounds=early_stop )

            rounds = model.best_iteration + 1

            train_score = log_loss( y_train, model.predict(dtrain, ntree_limit=rounds) )
            cv_score    = log_loss( y_valid, model.predict(dvalid, ntree_limit=rounds) )

            # Log loss is an error measure: a stage is an improvement when the
            # score goes DOWN (the original compared with '>').
            if best_cv_score is None or cv_score < best_cv_score:
                fail_count       = 0
                best_train_score = train_score
                best_cv_score    = cv_score
                best_model       = model

                # Carry the best stage forward: the next stage starts from
                # these margins instead of from scratch.
                ptrain = best_model.predict(dtrain, ntree_limit=rounds, output_margin=True)
                pvalid = best_model.predict(dvalid, ntree_limit=rounds, output_margin=True)

                dtrain.set_base_margin(ptrain)
                dvalid.set_base_margin(pvalid)
            else:
                fail_count += 1

                if fail_count >= max_fails:
                    break

            eta = eta_decay * eta
            booster_params["eta"] = eta

        train_scores.append(best_train_score)
        cv_scores.append(best_cv_score)

        print("Fold [%2d] %9.6f : %9.6f" % ( fold, best_train_score, best_cv_score ))

    print("-------------------------------")
    print("Mean      %9.6f : %9.6f" % ( np.mean(train_scores), np.mean(cv_scores) ) )
    print("Stds      %9.6f : %9.6f" % ( np.std(train_scores),  np.std(cv_scores) ) )
    print("-------------------------------")

    return best_model

# --------------------------------------------------------------------------------
#
