# SETTINGS

In [3]:
# libraries
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
import gc

In [4]:
# garbage collection: make sure the automatic collector is switched on
if not gc.isenabled():
    gc.enable()

# DATA PREPARATION

In [5]:
# import data: the two application tables plus the two history tables
train, test, prev, buro = map(
    pd.read_csv,
    (
        "../data/raw/application_train.csv",
        "../data/raw/application_test.csv",
        "../data/raw/previous_application.csv",
        "../data/raw/bureau.csv",
    ),
)

In [6]:
# extract target: take the label column out of train and keep it as y
y = train.pop("TARGET")

In [7]:
##### CONVERT FACTOR FEATURES
# Every object-dtype column is replaced by integer codes from pd.factorize.

### prev and buro: each history table is encoded on its own
prev_cat_features = [name for name, kind in prev.dtypes.items() if kind == "object"]
buro_cat_features = [name for name, kind in buro.dtypes.items() if kind == "object"]
for frame, cat_cols in ((prev, prev_cat_features), (buro, buro_cat_features)):
    for col in cat_cols:
        frame[col] = pd.factorize(frame[col])[0]

### train and test: the codes are derived from train and re-used for test
cat_feats = [name for name, kind in train.dtypes.items() if kind == "object"]
for col in cat_feats:
    codes, categories = pd.factorize(train[col])
    train[col] = codes
    # get_indexer looks each test value up in train's categories (-1 = no match)
    test[col] = categories.get_indexer(test[col])

In [8]:
##### AGGREGATE FEATURES
# Collapse each history table to one row per SK_ID_CURR: the mean of every
# column plus a count of the records in the group.

### prev
prev_by_client = prev.groupby('SK_ID_CURR')
avg_prev = prev_by_client.mean()
cnt_prev = prev_by_client[['SK_ID_PREV']].count()
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']
# the averaged record id carries no information once the count is stored
avg_prev = avg_prev.drop('SK_ID_PREV', axis = 1)

### buro
buro_by_client = buro.groupby('SK_ID_CURR')
avg_buro = buro_by_client.mean()
avg_buro['buro_count'] = buro_by_client['SK_ID_BUREAU'].count()
avg_buro = avg_buro.drop('SK_ID_BUREAU', axis = 1)

In [9]:
##### MERGE DATA
# Left-join both aggregated history tables onto the application tables.

# turn the SK_ID_CURR index back into a column so it can serve as the join key
prev_agg = avg_prev.reset_index()
buro_agg = avg_buro.reset_index()

# data
train = (train
         .merge(right = prev_agg, how = 'left', on = 'SK_ID_CURR')
         .merge(right = buro_agg, how = 'left', on = 'SK_ID_CURR'))

# test
test = (test
        .merge(right = prev_agg, how = 'left', on = 'SK_ID_CURR')
        .merge(right = buro_agg, how = 'left', on = 'SK_ID_CURR'))

In [10]:
# exclude features: every train column except the listed ones is a model input
excluded_feats = ["SK_ID_CURR"]
keep_mask = ~train.columns.isin(excluded_feats)
features = train.columns[keep_mask].tolist()

In [11]:
# check dimensions: (rows, columns) of train, then of test
for frame in (train, test):
    print(frame.shape)

(307511, 173)
(48744, 173)


In [12]:
# data partitioning: hold out 30% of the rows, with a fixed random_state
dtrain, dvalid, y_train, y_valid = train_test_split(
    train, y,
    test_size    = 0.30,
    random_state = 42,
)
for part in (dtrain, dvalid):
    print(part.shape)

(215257, 173)
(92254, 173)


# MODELING

In [16]:
### PARAMETERS

# cross-validation: number of folds and the seed for the fold shuffling
num_folds, seed = 5, 42

# lightGBM hyper-parameters, collected in one place and unpacked below
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# subsample_freq is non-zero - confirm the default of the installed version.
lgb_params = dict(
    n_estimators     = 1000,
    learning_rate    = 0.005,
    num_leaves       = 70,
    colsample_bytree = 0.8,
    subsample        = 0.9,
    max_depth        = 7,
    reg_alpha        = 0.1,
    reg_lambda       = 0.1,
    min_split_gain   = 0.01,
    min_child_weight = 2,
)
lgb = LGBMClassifier(**lgb_params)

# learner settings handed to lgb.fit() in the cross-validation loop
metric, verbose, stopping = "auc", 50, 10

In [17]:
### PREPARATIONS

# data partitioning: shuffled K-fold splitter with a fixed seed
folds = KFold(n_splits = num_folds, shuffle = True, random_state = seed)

# placeholders: one AUC per fold and one averaged prediction per test row
valid_aucs, test_preds = np.zeros(num_folds), np.zeros(len(test))

In [19]:
### CROSS-VALIDATION LOOP
# Fits the classifier once per fold, records the validation AUC of each fold
# and averages the per-fold test predictions into test_preds.

# Reset the accumulators in this cell so that it can be re-run safely. The loop
# adds each fold's share to test_preds with "+=", so starting from values left
# behind by an earlier (possibly interrupted) run would inflate the averaged
# test predictions that are later written to the submission file.
valid_aucs = np.zeros(folds.n_splits)
test_preds = np.zeros(test.shape[0])

# The test feature matrix is the same for every fold, so slice it only once.
test_x = test[features]

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train)):

    # data partitioning: rows of the current training / validation fold
    trn_x, trn_y = train[features].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = train[features].iloc[val_idx], y.iloc[val_idx]

    # train lightGBM; both sets are passed as eval sets so the log reports
    # the metric on the training fold and on the validation fold
    # NOTE(review): recent LightGBM releases dropped the `verbose` and
    # `early_stopping_rounds` arguments of fit() in favour of callbacks -
    # confirm against the installed version before upgrading.
    lgb.fit(trn_x, trn_y,
            eval_set = [(trn_x, trn_y), (val_x, val_y)],
            eval_metric = metric,
            verbose = verbose,
            early_stopping_rounds = stopping)

    # predictions: score with the best iteration found by early stopping
    valid_preds = lgb.predict_proba(val_x, num_iteration = lgb.best_iteration_)[:, 1]
    valid_aucs[n_fold] = roc_auc_score(val_y, valid_preds)
    # every fold contributes 1 / n_splits of the final test prediction
    test_preds += lgb.predict_proba(test_x, num_iteration = lgb.best_iteration_)[:, 1] / folds.n_splits

    # print performance
    print("-----------------------")
    print("Fold%2d AUC: %.6f" % (n_fold + 1, valid_aucs[n_fold]))
    print("-----------------------")
    print("")

    # clear memory
    del trn_x, trn_y, val_x, val_y
    gc.collect()

# the hoisted test slice is no longer needed once all folds are scored
del test_x
gc.collect()

# print overall performance
print("Cross-Validation AUC score %.6f" % np.mean(valid_aucs))

Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[21]	training's auc: 0.745636	valid_1's auc: 0.734939
-----------------------
Fold 1 AUC: 0.734939
-----------------------

Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[26]	training's auc: 0.752832	valid_1's auc: 0.740177
-----------------------
Fold 2 AUC: 0.740177
-----------------------

Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[21]	training's auc: 0.746338	valid_1's auc: 0.734224
-----------------------
Fold 3 AUC: 0.734224
-----------------------

Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[21]	training's auc: 0.753588	valid_1's auc: 0.737187
-----------------------
Fold 4 AUC: 0.737187
-----------------------

Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[26]	training's auc

ValueError: Found input variables with inconsistent numbers of samples: [307511, 61502]

# SUBMISSION

In [136]:
# create submission: attach the averaged predictions and keep id + prediction
test["TARGET"] = test_preds
subm = test.loc[:, ["SK_ID_CURR", "TARGET"]]

# export CSV without the row index, probabilities written with 8 decimals
subm.to_csv(
    "../submissions/bagged_lgbm.csv",
    index = False,
    float_format = "%.8f",
)