In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np

import csv
import os
import sys

from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.externals import joblib
from sklearn.metrics import log_loss
import xgboost as xgb

from bayes_opt import BayesianOptimization

import matplotlib.pyplot as plt

In [2]:

# Booster-level XGBoost settings (native parameter names).
xgboost_params = dict(
    objective="binary:logistic",
    booster="gbtree",
    eval_metric=["auc", "error", "map", "logloss"],
    eta=0.01,  # other values noted by the author: 0.06, 0.01
    # min_child_weight=240,
    subsample=0.75,
    colsample_bytree=0.68,
    max_depth=7,
)

print('Load data...')

# Training data: split the label column off, then discard the row identifier
# so that only feature columns remain in `train`.
train = pd.read_csv('data/train.csv')
target = train.pop('target')
del train['ID']

# Test data: keep the identifiers as a plain array and leave only feature
# columns in `test`.
test = pd.read_csv('data/test.csv')
ids = test.pop('ID').values
#




Load data...


In [3]:
def dummify(name, series):
    """Replace a categorical series with one-hot ("dummy") columns.

    Parameters
    ----------
    name : str
        Feature name. ``name + '_'`` is passed to ``pd.get_dummies`` as the
        column prefix.
    series : pandas.Series
        The categorical values to encode.

    Returns
    -------
    tuple
        ``(dummies, dummy_column_names, null_positions)``:
        the dummy-encoded frame, an array with its column names, and a list
        of the positional row numbers at which ``series`` is null.
    """
    encoded = pd.get_dummies(series, prefix=name + '_')
    # Rows that were null get no indicator set by get_dummies (all zeros),
    # so their positions are reported separately for the caller.
    null_positions = np.flatnonzero(series.isnull().values).tolist()
    return encoded, encoded.columns.values, null_positions

In [4]:
# Drop features with > 0.9 correlation. Keep feature with fewest NaNs

def _select_correlated_drops(frame, corr_matrix, threshold=0.9):
    """Choose which columns of highly correlated groups to drop.

    For every column (in ``corr_matrix`` column order) that has not been
    marked for dropping yet, collect the columns whose correlation with it is
    above ``threshold`` and below 1, keep the one with the fewest nulls in
    ``frame`` (ties go to the first candidate, the column itself being last)
    and mark the others for dropping.

    Only columns that are still alive compete for being kept. Previously an
    already-dropped column could win the comparison, which caused every
    remaining column of the group to be dropped with no representative left.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data used to count nulls per column.
    corr_matrix : pandas.DataFrame
        Correlation matrix of ``frame``'s columns.
    threshold : float
        Lower (exclusive) correlation bound for two columns to be grouped.

    Returns
    -------
    set
        Names of the columns to drop.
    """
    # Computed once; the null count of a column never changes in the loop.
    na_counts = frame.isnull().sum()
    dropped = set()

    for col in corr_matrix.columns.values:
        if col in dropped:
            continue

        col_corr = corr_matrix[col]
        # NOTE(review): `< 1` is what excludes the column itself; it also
        # excludes any other column whose correlation is exactly 1.
        partners = col_corr[(col_corr > threshold) & (col_corr < 1)].index.tolist()

        # Ignore partners that were already removed by an earlier group.
        candidates = [name for name in partners if name not in dropped]
        if not candidates:
            continue

        candidates.append(col)
        # min() returns the first minimum, matching a strict `<` scan.
        keeper = min(candidates, key=lambda name: na_counts[name])
        candidates.remove(keeper)
        dropped.update(candidates)

    return dropped


corr = train.corr()
to_drop = _select_correlated_drops(train, corr)


In [5]:
# print() with a single argument behaves the same on Python 2 and 3 and
# matches the print('...') calls used in the other cells.
print("Adding features...")

# New feature: number of missing values in each row.
train['NA_num'] = train.isnull().sum(axis=1)
test['NA_num'] = test.isnull().sum(axis=1)

Adding features...


In [6]:


drop_correlated = True

if drop_correlated:
    print('Drop Correlated ...')
    # Drop in place (later cells read `train`/`test`), but only the columns
    # that are still present, so re-running this cell does not raise KeyError.
    for frame in (train, test):
        present = [column for column in to_drop if column in frame.columns]
        frame.drop(present, axis=1, inplace=True)

print('Clearing...')
# Per categorical feature: names of the generated dummy columns and the row
# positions that were null, as returned by dummify(), for train and test.
train_dummy_columns = {}
train_nulls_dict = {}
test_dummy_columns = {}
test_nulls_dict = {}

cleaned_train = train.copy()
cleaned_test = test.copy()

# Dummy frames are collected and concatenated once after the loop instead of
# growing the cleaned frames on every iteration.
categorical_names = []
train_dummy_frames = []
test_dummy_frames = []

# Columns are looked up by name in both frames, so the result does not depend
# on train and test sharing the same column order.
for name in train.columns:
    train_series = train[name]
    test_series = test[name]

    if name == 'v22':
        # v22 has too many options to dummify, instead: factorize.
        # Test values are mapped through the categories learned on train.
        codes, categories = pd.factorize(train_series)
        cleaned_train[name] = codes
        cleaned_test[name] = categories.get_indexer(test_series)
        # but now we have -1 values (NaN, and test values unseen in train)
    elif train_series.dtype == 'O':
        print('Dummifying ' + name)
        categorical_names.append(name)

        train_dummies, train_dummy_list, train_null_list = dummify(name, train_series)
        test_dummies, test_dummy_list, test_null_list = dummify(name, test_series)

        train_dummy_frames.append(train_dummies)
        test_dummy_frames.append(test_dummies)

        train_dummy_columns[name] = train_dummy_list
        train_nulls_dict[name] = train_null_list
        test_dummy_columns[name] = test_dummy_list
        test_nulls_dict[name] = test_null_list
    else:
        # for int or float: fill NaN with the mean of the *training* column,
        # for both train and test.
        train_mean = train_series.mean()

        train_missing = train_series.isnull()
        if train_missing.any():
            cleaned_train.loc[train_missing, name] = train_mean

        test_missing = test_series.isnull()
        if test_missing.any():
            cleaned_test.loc[test_missing, name] = train_mean

# Replace the raw categorical columns by their dummy columns (appended at the
# end, in the order the features were encountered).
cleaned_train = pd.concat(
    [cleaned_train.drop(categorical_names, axis=1)] + train_dummy_frames, axis=1)
cleaned_test = pd.concat(
    [cleaned_test.drop(categorical_names, axis=1)] + test_dummy_frames, axis=1)

# Train and test were encoded independently, so a category present in only one
# of them yields a dummy column the other lacks. Align test to the training
# layout: categories unseen in train are discarded, categories absent from
# test become all-zero columns. Without this, cleaned_test.values does not
# line up with the features the model is trained on.
cleaned_test = cleaned_test.reindex(columns=cleaned_train.columns, fill_value=0)

Drop Correlated ...
Clearing...
Dummifying v3
Dummifying v24
Dummifying v30
Dummifying v31
Dummifying v47
Dummifying v52
Dummifying v56
Dummifying v66
Dummifying v71
Dummifying v74
Dummifying v75
Dummifying v79
Dummifying v91
Dummifying v107
Dummifying v110
Dummifying v112
Dummifying v113
Dummifying v125


In [7]:
# cleaned_train.isnull().sum(axis=1)
# with pd.option_context('display.max_rows', 999, 'display.max_columns', 3):
#    print cleaned_train.iloc[0]

In [8]:
# First split: 70% of the rows for training, 30% held out.
X_train, X_valid, y_train, y_valid = train_test_split(
    cleaned_train.values,
    target.values,
    test_size=0.30,
    random_state=1,
)

# Second split of the held-out part: 80% stays as validation data, 20% is set
# aside as an independent test set (X_itest / y_itest).
X_valid, X_itest, y_valid, y_itest = train_test_split(
    X_valid,
    y_valid,
    test_size=0.20,
    random_state=1,
)

# Switch: when True, the training matrix is rebuilt from all labelled rows.
final = False

xgtrain = xgb.DMatrix(X_train, label=y_train)
xgvalid = xgb.DMatrix(X_valid, label=y_valid)
xgtest = xgb.DMatrix(cleaned_test.values)

if final:
    xgtrain = xgb.DMatrix(cleaned_train.values, label=target.values)
    

#Now let's fit the model


In [None]:
# michael.pearmain's bayesboost from https://github.com/mpearmain/BayesBoost/blob/master/examples/otto_boost.py

def xgboostcv(max_depth,
              learning_rate,
              n_estimators,
              gamma,
              min_child_weight,
              max_delta_step,
              subsample,
              colsample_bytree,
              silent =True,
              nthread = -1,
              seed = 1234):
    """Objective for the Bayesian optimiser: mean 5-fold CV score of XGBoost.

    Builds an ``xgb.XGBClassifier`` with the given hyper-parameters and
    evaluates it with ``cross_val_score`` on the module-level ``X_train`` /
    ``y_train`` using the ``"log_loss"`` scorer.

    ``max_depth`` and ``n_estimators`` arrive as floats from the optimiser and
    are truncated with ``int()``; all other tuning parameters are passed
    through unchanged. ``silent``, ``nthread`` and ``seed`` are forwarded to
    the classifier.

    Returns
    -------
    float
        Mean of the per-fold scores.
        NOTE(review): presumably the scorer reports negated log loss so that
        larger is better (this value is maximised) -- confirm for the
        installed scikit-learn version.
    """
    model = xgb.XGBClassifier(max_depth = int(max_depth),
                              learning_rate = learning_rate,
                              n_estimators = int(n_estimators),
                              silent = silent,
                              nthread = nthread,
                              gamma = gamma,
                              min_child_weight = min_child_weight,
                              max_delta_step = max_delta_step,
                              subsample = subsample,
                              colsample_bytree = colsample_bytree,
                              seed = seed,
                              objective = "binary:logistic")
    # `scoring` is passed by keyword: as a 4th positional argument it is the
    # scorer in sklearn.cross_validation but would be taken as `groups` by
    # sklearn.model_selection.cross_val_score.
    fold_scores = cross_val_score(model,
                                  X_train,
                                  y_train,
                                  scoring="log_loss",
                                  cv=5)
    return fold_scores.mean()


# Search space for the optimiser: one (lower, upper) pair per xgboostcv
# argument. 'gamma' was previously written as (1., 0.01), i.e. with the bounds
# reversed; it now follows the same (lower, upper) order as every other entry.
xgboostBO = BayesianOptimization(xgboostcv,
                                 {'max_depth': (5, 10),
                                  'learning_rate': (0.01, 0.3),
                                  'n_estimators': (50, 1000),
                                  'gamma': (0.01, 1.),
                                  'min_child_weight': (2, 10),
                                  'max_delta_step': (0, 0.1),
                                  'subsample': (0.7, 0.8),
                                  'colsample_bytree' :(0.5, 0.99)
                                 })

# Run the optimisation with the library's default settings.
xgboostBO.maximize()
print('-'*53)

print('Final Results')
print('XGBOOST: %f' % xgboostBO.res['max']['max_val'])

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   learning_rate |   max_delta_step |   max_depth |   min_child_weight |   n_estimators |   subsample | 


In [None]:


# Train score:  0.377414274108
# Valid score:  0.459356080264
# Independent test set score:  0.460825726577