# Home Credit - Gradient Boosting
This model is based on the [LightGBM](https://lightgbm.readthedocs.io) gradient boosting library. Some additional feature engineering is performed; for brevity, it lives in separate Python utility modules. These currently extract the data from the other data sources, perform aggregations, encodings, etc., and then merge the results with the training / test data sets. The engineered data is then fed to the gradient boosting model. The data is split into cross-validation folds and a ROC AUC score is calculated.

In [2]:
import os, sys
import numpy as np
from matplotlib import pyplot as plt

import pandas as pd

from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier

sys.path.append("../src")

from data.pre_process import *
from lightgbm_utils import *

%matplotlib inline

In [3]:
# Locate the raw-data directory relative to the notebook's working directory.
# The trailing slash is kept as part of the path string.
current_dir = os.getcwd()
DATA_HOME_DIR = "{}/../data/raw/".format(current_dir)

## Data

In [4]:
# Show every column when a DataFrame is displayed (no truncation limit).
pd.set_option("display.max_columns", None)

In [5]:
# Load the un-engineered train and test frames plus `y` from DATA_HOME_DIR.
# NOTE(review): `load_train_test_data` arrives via a star import and its
# contract is not visible here; `y` is presumably the training target
# (one value per training row) -- confirm against the helper.
df_train_pre, df_test_pre, y = load_train_test_data(DATA_HOME_DIR)

In [6]:
# Sanity check: dimensions of the training frame before feature engineering.
df_train_pre.shape

(307511, 121)

In [7]:
# Feature engineering: start from the dummy-encoded base frames, then run each
# enrichment step in turn. Every step receives the raw-data directory and the
# current train/test frames, and hands back the updated pair.
df_train, df_test = load_data_dummies(df_train_pre, df_test_pre)

# Order matters only in that each step consumes the previous step's output.
for _append_step in (
    append_poly_feature,
    append_bureau_data,
    append_previous_applications,
    append_pos_data,
    append_credit_card_data,
    append_installments_data,
):
    df_train, df_test = _append_step(
        in_dir=DATA_HOME_DIR, df_train=df_train, df_test=df_test
    )



In [8]:
# Sanity check: dimensions of the training frame after feature engineering.
df_train.shape

(307511, 831)

In [9]:
# Sanity check: the test frame should end up with the same number of columns
# as the training frame.
df_test.shape

(48744, 831)

In [10]:
# Sanity check: `y` should have one entry per training row.
y.shape

(307511,)

# Split data
Run algorithm using cross folds

In [11]:
# Columns handed to the model: everything except 'SK_ID_CURR', which is only
# used later as the key column of the submission file.
feats = [col for col in df_train.columns if col != 'SK_ID_CURR']

In [12]:
# 5-fold cross-validation with shuffling. No random_state is passed, so the
# fold assignment is not reproducible between runs; pass random_state=<int>
# (42 was used previously) when run-to-run consistency is needed for testing.
folds = KFold(n_splits=5, shuffle=True)

# The Model
Now run the light GBM model using the cross folds. First the model. 

TODO: Plug in optunity here for hyperparameter tuning...

In [13]:
# The hyper parameters
EARLY_STOPPING_ROUNDS = 250
args = {
    "n_estimators": 4000,
    "learning_rate": 0.03,
    "num_leaves": 30,
    "colsample_bytree": 0.8,
    "subsample": 0.9,
    "max_depth": 6,
    "max_bin": 1024,
    "num_iterations": 1000,
    "min_data_in_leaf": 20,
    "reg_alpha": 0.1,
    "reg_lambda": 0.1,
    "min_split_gain": 0.01,
    "min_child_weight": 2,
    "silent": -1,
    "verbose": -1,
   # "objective": "regression",
   # "metric": "",
    "objective": "binary",
    "metric": "binary_log_loss",
    "bagging_fraction": 0.9,
    "bagging_freq": 15,
    "lambda_l1": 0.0,
    "lambda_l2": 0.0,
    "min_gain_to_split": 0.0,
    "feature_fraction": 1.0
}

The `lightgbm_utils` module provides a utility function (`run_lightgbm_model`) to run a cross-fold LightGBM model. See the docs of that function for details.

In [None]:
# Run 1: baseline hyper-parameters. The helper returns three objects, bound
# here as train-fold predictions, test predictions and feature importances.
# (Disabled in the original call: save_model=True, file_prefix="m1_nl35".)
df_fold_preds_train, df_fold_preds_test, df_feature_importance = run_lightgbm_model(
    df_train,
    df_test,
    y,
    folds,
    feats,
    early_stopping=EARLY_STOPPING_ROUNDS,
    args_dict=args,
)



In [None]:
# Run 2 setup: raise num_leaves to 64 and max_depth to 7, then echo the full
# parameter set so the configuration is recorded next to the results.
args.update(num_leaves=64, max_depth=7)
print(args)

In [None]:
# Run 2: same call as before, now picking up the mutated `args`. The result
# variables from the previous run are overwritten.
# (Disabled in the original call: save_model=True, file_prefix="m1_nl35".)
df_fold_preds_train, df_fold_preds_test, df_feature_importance = run_lightgbm_model(
    df_train,
    df_test,
    y,
    folds,
    feats,
    early_stopping=EARLY_STOPPING_ROUNDS,
    args_dict=args,
)

In [None]:
# Run 3 setup: switch the booster to DART with a 10% drop rate, keep the
# learning rate at 0.03 and shorten the early-stopping patience to 200.
# NOTE(review): LightGBM reportedly does not apply early stopping in dart
# mode -- confirm whether EARLY_STOPPING_ROUNDS has any effect for this run.
args.update({
    "boosting": "dart",
    "drop_rate": 0.1,
    "learning_rate": 0.03,
})
EARLY_STOPPING_ROUNDS = 200
print(args)

In [1]:
# Run 3: DART configuration. The test predictions produced here are the ones
# written to the submission file below.
# (Disabled in the original call: save_model=True, file_prefix="m1_nl35".)
df_fold_preds_train, df_fold_preds_test, df_feature_importance = run_lightgbm_model(
    df_train,
    df_test,
    y,
    folds,
    feats,
    early_stopping=EARLY_STOPPING_ROUNDS,
    args_dict=args,
)

NameError: name 'run_lightgbm_model' is not defined

### Submission

In [None]:
# Build the submission file: one row per test record with its id and the
# predicted TARGET value from the most recent model run.
# Take an explicit copy of the id column so that adding TARGET writes to an
# independent frame rather than to a slice of df_test (this avoids pandas'
# SettingWithCopyWarning and any ambiguity about which object is modified).
df_submission = df_test[['SK_ID_CURR']].copy()
df_submission['TARGET'] = df_fold_preds_test
# index=False: the row index is not written to the CSV.
df_submission.to_csv('lgbm_submission3.csv', index=False)