# HOME CREDIT - XGBOOT - No merge Data

## 1. Import Library and read data

In [1]:
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for making plots with seaborn
color = sns.color_palette()

import cufflinks as cf
cf.go_offline()

from sklearn.preprocessing import LabelEncoder

import time

notebook = time.time()

In [2]:
# read training data
application_train = pd.read_csv('../input/application_train.csv')

In [3]:
# read testing data
application_test = pd.read_csv('../input/application_test.csv')

## 2. Handling training data and testing data

In [5]:
application_train['TARGET'].value_counts()

0    282686
1     24825
Name: TARGET, dtype: int64

In [7]:
application_train.dtypes.value_counts()

float64    65
int64      41
object     16
dtype: int64

## 3. Create label encoder object

In [8]:
le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in application_train:
    if application_train[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(application_train[col].unique())) <= 2:
            # Train on the training data
            le.fit(application_train[col])
            # Transform both training and testing data
            application_train[col] = le.transform(application_train[col])
            application_test[col] = le.transform(application_test[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
            
print('%d columns were label encoded.' % le_count)

3 columns were label encoded.


## 4. One-hot encoding of categorical variables

In [9]:
application_train = pd.get_dummies(application_train)
application_test = pd.get_dummies(application_test)

print('Training Features shape: ', application_train.shape)
print('Testing Features shape: ', application_test.shape)

Training Features shape:  (307511, 243)
Testing Features shape:  (48744, 239)


## 5. Aligning Training and Testing Data

In [11]:
train_labels = application_train['TARGET']

# Align the training and testing data, keep only columns present in both dataframes
application_train, application_test = application_train.align(application_test, join = 'inner', axis = 1)

# Add the target back in
application_train['TARGET'] = train_labels

print('Training Features shape: ', application_train.shape)
print('Testing Features shape: ', application_test.shape)

Training Features shape:  (307511, 240)
Testing Features shape:  (48744, 239)


In [12]:
# Drop the target from the training data
if 'TARGET' in application_train:
    train = application_train.drop(columns = ['TARGET'])
else:
    train = application_train.copy()

In [16]:
test = application_test.copy()

In [18]:
print('Training Features shape: ', application_train.shape)
print('Testing Features shape: ', test.shape)

Training Features shape:  (307511, 240)
Testing Features shape:  (48744, 239)


In [19]:
train_y = train_labels
train_X = train

In [20]:
print('Training Features shape: ', train_y.shape)
print('Training Features shape: ', train_X.shape)
print('Testing Features shape: ', test.shape)

Training Features shape:  (307511,)
Training Features shape:  (307511, 239)
Testing Features shape:  (48744, 239)


In [11]:
print("\nInitial runtime: %0.2f Minutes"%((time.time() - notebook)/60))


Initial runtime: 0.37 Minutes


In [None]:
# Initial runtime: 0.37 Minutes

## 6. XGBoot Algorithm

In [8]:
from xgboost import XGBClassifier 
#from sklearn.model_selection import train_test_split 
#from sklearn.metrics import accuracy_score
#from sklearn.datasets import load_breast_cancer

In [9]:
#from sklearn.preprocessing import Imputer

### XGBoot Regressor

In [12]:
import time

In [13]:
from xgboost import XGBRegressor

modelstart = time.time()

#my_model = XGBRegressor()
"""my_model = XGBRegressor(objective = 'binary:logistic',
          booster = 'gbtree',
          eval_metric = 'auc',
          nthread = 4,
          eta = 0.05,
          max_depth = 6,
          min_child_weight = 30,
          gamma = 0,
          subsample = 0.85,
          colsample_bytree = 0.7,
          colsample_bylevel = 0.632,
          reg_alpha = 0,
          reg_lambda = 0,
          nrounds = 2000)"""

my_model = XGBRegressor(objective = 'binary:logistic',
          booster = 'gbtree',
          eval_metric = 'auc',
          nthread = 4,
          eta = 0.05,
          max_depth = 8,
          min_child_weight = 60,
          gamma = 0,
          subsample = 0.8715623,
          colsample_bytree = 0.7,
          colsample_bylevel = 0.632,
          reg_alpha = 0.041545473,
          reg_lambda = 0.0735294,
          nrounds = 2000)
# Add silent=True to avoid printing out updates with each cycle
my_model.fit(train_X, train_y, verbose=False)


Series.base is deprecated and will be removed in a future version


Series.base is deprecated and will be removed in a future version



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.632,
       colsample_bytree=0.7, eta=0.05, eval_metric='auc', gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=8, min_child_weight=60, missing=None, n_estimators=100,
       n_jobs=1, nrounds=2000, nthread=4, objective='binary:logistic',
       random_state=0, reg_alpha=0.041545473, reg_lambda=0.0735294,
       scale_pos_weight=1, seed=None, silent=True, subsample=0.8715623)

In [28]:
#test = application_test.copy()

In [15]:
# make predictions
predictions = my_model.predict(test)

In [16]:
# Submission dataframe
submit = application_test[['SK_ID_CURR']]
submit['TARGET'] = predictions

In [17]:
# Save the submission to a csv file
submit.to_csv('../output/log_xgbootRegRessor_noMergeData_V2.csv', index = False)

In [18]:
print("\nModel Runtime: %0.2f Minutes"%((time.time() - modelstart)/60))


Model Runtime: 4.23 Minutes


In [73]:
# Notebook Time: 3.89 minutes
# Total time: 3.52 Minutes
# Score: 0.74323

### XGBoost DMatrix

In [15]:
import xgboost as xgb

In [16]:
feat_names = train_X.columns

In [17]:
d_train = xgb.DMatrix(train_X,train_y,feature_names=feat_names)


Series.base is deprecated and will be removed in a future version


Series.base is deprecated and will be removed in a future version



In [18]:
test = application_test.copy()
d_test = xgb.DMatrix(test,feature_names=feat_names)

In [21]:
import time

In [24]:
#xgb_params = {'eta': 0.01, 
#              'max_depth': 6, 
#              'subsample': 0.8, 
#              'colsample_bytree': 0.632,
#              'min_child_weight' : 35,
#              #'scale_pos_weight': ,
#              'objective': 'binary:logistic', 
#              'eval_metric': 'auc', 
#              'seed': 23,
#              'lambda': 0.50,
#              'alpha': 0.25,
#              'silent': 1
#             }

xgb_params = {'objective': 'binary:logistic',
          'booster': 'gbtree',
          'eval_metric': 'auc',
          'nthread': 4,
          'eta': 0.05,
          'max_depth': 6, # 8
          'min_child_weight': 30, #60
          'gamma': 0,
          'subsample': 0.85, # 0.8715623
          'colsample_bytree': 0.7,
          'colsample_bylevel': 0.632,
          'alpha': 0, #'reg_alpha': 0.041545473
          'lambda': 0, #'reg_lambda': 0.0735294
          'nrounds': 2000
             }

In [26]:

n_rounds = 2000
modelstart = time.time()
watchlist = [(d_train, 'train')]
model = xgb.train(xgb_params, d_train, n_rounds, watchlist, verbose_eval=150, early_stopping_rounds=200)


[0]	train-auc:0.717151
Will train until train-auc hasn't improved in 200 rounds.
[150]	train-auc:0.779982
[300]	train-auc:0.796482
[450]	train-auc:0.808184
[600]	train-auc:0.81839
[750]	train-auc:0.827434
[900]	train-auc:0.835922
[1050]	train-auc:0.843154
[1200]	train-auc:0.849718
[1350]	train-auc:0.85586
[1500]	train-auc:0.86216
[1650]	train-auc:0.867976
[1800]	train-auc:0.873043
[1950]	train-auc:0.878085
[1999]	train-auc:0.879557


In [31]:
xgb_pred = model.predict(d_test)

In [38]:
# Save the submission to a csv file
testdex = application_test.SK_ID_CURR

xgb_sub = pd.DataFrame(xgb_pred,columns=["TARGET"],index=testdex)
xgb_sub.index.rename("SK_ID_CURR",inplace=True)
xgb_sub.to_csv("log_xgbootDMatrix_noMergeData.csv",index=True,float_format='%.8f')

In [29]:
print("\nModel Runtime: %0.2f Minutes"%((time.time() - modelstart)/60))


Model Runtime: 62.10 Minutes
Notebook Runtime: 66.78 Minutes


In [None]:
# Notebook Time: 62.47 minutes
# Total time: 62.10 Minutes
# Score: 0.74385

### LightGBM

In [43]:
import lightgbm as lgbm
modelstart = time.time()

testdex = application_test.SK_ID_CURR

In [44]:
# LightGBM parameters found by Bayesian optimization
lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'nthread': 4,
    'learning_rate': 0.02,  # 02,
    'num_leaves': 20,
    'colsample_bytree': 0.9497036,
    'subsample': 0.8715623,
    'subsample_freq': 1,
    'max_depth': 8,
    'reg_alpha': 0.041545473,
    'reg_lambda': 0.0735294,
    'min_split_gain': 0.0222415,
    'min_child_weight': 60, # 39.3259775,
    'seed': 0,
    'verbose': -1,
    'metric': 'auc',
}


In [45]:
lgb_train = lgbm.Dataset(
                 train_X, 
                 train_y)

In [46]:
lgb = lgbm.train(lgb_params, lgb_train)

In [47]:
gmb_pred = lgb.predict(test)

In [48]:
gbm_sub = pd.DataFrame(gmb_pred,columns=["TARGET"],index=testdex)
gbm_sub.index.rename("SK_ID_CURR",inplace=True)
gbm_sub.to_csv("../output/log_gbm_noMergeData.csv",index=True,float_format='%.8f')

In [49]:
print("\nModel Runtime: %0.2f Minutes"%((time.time() - modelstart)/60))


Model Runtime: 0.35 Minutes


In [50]:
# Notebook Time: 0.72 Minutes
# Time: 0.35 Minutes
# Score: 0.72983

## To be continued... !!!