In [1]:
import gc
import os
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold

In [2]:
# Input files, resolved relative to the notebook's working directory.
TRAIN_CSV = "customer_train_db.csv"
CREDITS_CSV = "customer_credits_db.csv"

# Applicant-level table; it carries the `Default` target column.
new_df3 = pd.read_csv(TRAIN_CSV)
# Credit-history table; the same `ID` can appear on several rows.
credit = pd.read_csv(CREDITS_CSV)

In [3]:
new_df3.head(10)

Unnamed: 0,ID,Default,Gender,Own_Car,Own_Residence,Num_of_Child,Income,Amount_Credit,Annuity,Price,Region_Population_Factor,Birth_Day_CNT,Employed_Day_CNT,Registration_Day_CNT,Car_Age
0,446165,0,F,N,Y,0,247500.0,756000.0,22234.5,756000.0,0.026392,-17106,-8438,-7502,
1,150009,0,F,N,N,0,46566.0,450000.0,12375.0,450000.0,0.028663,-19454,-899,-3224,
2,247486,0,F,N,Y,0,76500.0,295168.5,15201.0,238500.0,0.010556,-16139,-2180,-3410,
3,164890,0,M,N,N,0,162000.0,900000.0,45954.0,900000.0,0.006008,-19430,-805,-4366,
4,353539,0,M,Y,Y,1,180000.0,445333.5,47961.0,423000.0,0.018801,-10681,-1921,-598,5.0
5,304225,0,F,N,N,0,126000.0,603000.0,24556.5,603000.0,0.035792,-11516,-4165,-5522,
6,105812,0,M,N,Y,0,144000.0,284400.0,13387.5,225000.0,0.010147,-10971,-426,-6267,
7,102316,0,F,Y,Y,2,270000.0,713889.0,43803.0,661500.0,0.072508,-15666,365243,-3215,5.0
8,410579,0,F,Y,Y,0,247500.0,769500.0,27391.5,769500.0,0.026392,-14247,-2436,-8292,5.0
9,127087,0,F,Y,N,0,94500.0,284400.0,13387.5,225000.0,0.031329,-15615,-3459,-4733,1.0


In [4]:
credit.head(n=10)

Unnamed: 0,ID,File_ID,Active_Credit,Currency,Credit_Days,Overdue_Days,Max_Overdue,Credit,Credit_Debt,Credit_Limit,Credit_Overdue,Type
0,215354,5714462,Closed,currency 1,-497,0,,91323.0,0.0,,0.0,Consumer credit
1,215354,5714463,Active,currency 1,-208,0,,225000.0,171342.0,,0.0,Credit card
2,215354,5714464,Active,currency 1,-203,0,,464323.5,,,0.0,Consumer credit
3,215354,5714465,Active,currency 1,-203,0,,90000.0,,,0.0,Credit card
4,215354,5714466,Active,currency 1,-629,0,77674.5,2700000.0,,,0.0,Consumer credit
5,215354,5714467,Active,currency 1,-273,0,0.0,180000.0,71017.38,108982.62,0.0,Credit card
6,215354,5714468,Active,currency 1,-43,0,0.0,42103.8,42103.8,0.0,0.0,Consumer credit
7,162297,5714469,Closed,currency 1,-1896,0,14985.0,76878.45,0.0,0.0,0.0,Consumer credit
8,162297,5714470,Closed,currency 1,-1146,0,0.0,103007.7,0.0,0.0,0.0,Consumer credit
9,162297,5714471,Active,currency 1,-1146,0,0.0,4500.0,0.0,0.0,0.0,Credit card


In [5]:
new_df3.shape

(65499, 15)

In [6]:
new_df3.dtypes

ID                            int64
Default                       int64
Gender                       object
Own_Car                      object
Own_Residence                object
Num_of_Child                  int64
Income                      float64
Amount_Credit               float64
Annuity                     float64
Price                       float64
Region_Population_Factor    float64
Birth_Day_CNT                 int64
Employed_Day_CNT              int64
Registration_Day_CNT          int64
Car_Age                     float64
dtype: object

In [7]:
credit.shape

(65499, 12)

In [8]:
credit.dtypes

ID                  int64
File_ID             int64
Active_Credit      object
Currency           object
Credit_Days         int64
Overdue_Days        int64
Max_Overdue       float64
Credit            float64
Credit_Debt       float64
Credit_Limit      float64
Credit_Overdue    float64
Type               object
dtype: object

In [9]:
def conv_obj(item):
    """Coerce ``item`` to ``float``, falling back to zero when it cannot be converted.

    Parameters
    ----------
    item : object
        Any value, e.g. a number or a numeric string.

    Returns
    -------
    float
        ``float(item)`` when the conversion succeeds, otherwise ``0.0``.
        NaN converts successfully, so NaN inputs come back as NaN, not zero.
    """
    try:
        return float(item)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures fall back to zero. A bare ``except``
        # would also swallow KeyboardInterrupt/SystemExit and mask real bugs.
        return 0.0

In [10]:
# Push every Annuity entry through conv_obj so the column is numeric;
# entries that cannot be converted end up as 0.
# Only Annuity is coerced here; Price is left exactly as loaded.
new_df3['Annuity'] = new_df3['Annuity'].map(conv_obj)

In [11]:
new_df3['Annuity'].head()

0    22234.5
1    12375.0
2    15201.0
3    45954.0
4    47961.0
Name: Annuity, dtype: float64

In [12]:
# The code treats this exact value in Employed_Day_CNT as "no value" and blanks
# it out. NOTE(review): presumably a data-provider placeholder -- every other
# value visible in the preview is a negative day offset; confirm with the source.
EMPLOYED_DAY_PLACEHOLDER = 365243

# Vectorised replacement (no per-row Python lambda); the column becomes float64
# once a NaN is introduced.
new_df3['Employed_Day_CNT'] = new_df3['Employed_Day_CNT'].replace(EMPLOYED_DAY_PLACEHOLDER, np.nan)

# Ratio features built from the raw monetary columns.
new_df3['Income/Credit_perc'] = new_df3['Income'] / new_df3['Amount_Credit']
# Despite the "_perc" suffix this is sqrt(Annuity / (1 + Income)).
# NOTE(review): the +1 presumably guards against Income == 0 -- confirm intent.
new_df3['Annuity/Income_perc'] = (new_df3['Annuity'] / (1.0 + new_df3['Income'])) ** 0.5
new_df3['Payment_rate'] = new_df3['Annuity'] / new_df3['Amount_Credit']
new_df3['Loan/Income_ratio'] = new_df3['Amount_Credit'] / new_df3['Income']
new_df3['Credits/Goods_ratio'] = new_df3['Amount_Credit'] / new_df3['Price']

In [13]:
new_df3.dtypes

ID                            int64
Default                       int64
Gender                       object
Own_Car                      object
Own_Residence                object
Num_of_Child                  int64
Income                      float64
Amount_Credit               float64
Annuity                     float64
Price                       float64
Region_Population_Factor    float64
Birth_Day_CNT                 int64
Employed_Day_CNT            float64
Registration_Day_CNT          int64
Car_Age                     float64
Income/Credit_perc          float64
Annuity/Income_perc         float64
Payment_rate                float64
Loan/Income_ratio           float64
Credits/Goods_ratio         float64
dtype: object

In [14]:
new_df3.head()

Unnamed: 0,ID,Default,Gender,Own_Car,Own_Residence,Num_of_Child,Income,Amount_Credit,Annuity,Price,Region_Population_Factor,Birth_Day_CNT,Employed_Day_CNT,Registration_Day_CNT,Car_Age,Income/Credit_perc,Annuity/Income_perc,Payment_rate,Loan/Income_ratio,Credits/Goods_ratio
0,446165,0,F,N,Y,0,247500.0,756000.0,22234.5,756000.0,0.026392,-17106,-8438.0,-7502,,0.327381,0.299727,0.029411,3.054545,1.0
1,150009,0,F,N,N,0,46566.0,450000.0,12375.0,450000.0,0.028663,-19454,-899.0,-3224,,0.10348,0.515506,0.0275,9.663703,1.0
2,247486,0,F,N,Y,0,76500.0,295168.5,15201.0,238500.0,0.010556,-16139,-2180.0,-3410,,0.259174,0.445761,0.051499,3.858412,1.237604
3,164890,0,M,N,N,0,162000.0,900000.0,45954.0,900000.0,0.006008,-19430,-805.0,-4366,,0.18,0.532602,0.05106,5.555556,1.0
4,353539,0,M,Y,Y,1,180000.0,445333.5,47961.0,423000.0,0.018801,-10681,-1921.0,-598,5.0,0.404191,0.516187,0.107697,2.474075,1.052798


In [15]:
credit.dtypes

ID                  int64
File_ID             int64
Active_Credit      object
Currency           object
Credit_Days         int64
Overdue_Days        int64
Max_Overdue       float64
Credit            float64
Credit_Debt       float64
Credit_Limit      float64
Credit_Overdue    float64
Type               object
dtype: object

In [19]:
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode the object-dtype columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; an encoded copy is returned.
    nan_as_category : bool, default True
        Forwarded to ``pd.get_dummies`` as ``dummy_na``; when true each encoded
        column also gets a ``<column>_nan`` indicator.

    Returns
    -------
    tuple
        ``(encoded_frame, added_columns)`` where ``added_columns`` lists, in
        frame order, the column names that did not exist in ``df``.
    """
    names_before = list(df.columns)

    # Every object-dtype column is encoded, except one literally named '_id'.
    to_encode = []
    for name in df.columns:
        if df[name].dtype == 'object' and name != '_id':
            to_encode.append(name)

    encoded = pd.get_dummies(df, columns=to_encode, dummy_na=nan_as_category)

    added = []
    for name in encoded.columns:
        if name not in names_before:
            added.append(name)
    return encoded, added

# Per-ID summary statistics computed for each numeric credit-history column.
num_aggregations = {
    'Credit_Days': ['max', 'mean'],
    'Overdue_Days': ['max', 'mean'],
    'Max_Overdue': ['max', 'mean'],
    'Credit': ['median'],
    'Credit_Debt': ['mean'],
    'Credit_Overdue': ['max', 'mean'],
}

# One-hot encode the categorical credit columns (NaN gets its own indicator);
# every resulting indicator column is summarised by its per-ID median.
new_bureau, bureau_cat = one_hot_encoder(credit, True)
cat_aggregations = {indicator: ['median'] for indicator in bureau_cat}


def _prefixed_names(prefix, column_pairs):
    """Flatten (column, statistic) pairs into '<prefix><column>_<STATISTIC>' labels."""
    return pd.Index([prefix + column + "_" + stat.upper() for column, stat in column_pairs])


# All credits: numeric statistics plus the categorical indicators, one row per ID.
bureau_agg = new_bureau.groupby('ID').agg({**num_aggregations, **cat_aggregations})
# Keep the raw (column, statistic) pairs around for inspection before renaming.
temp_col = bureau_agg.columns.tolist()
bureau_agg.columns = _prefixed_names('BURO_', temp_col)

# Bureau: Active credits - using only numerical aggregations
active = new_bureau[new_bureau['Active_Credit_Active'] == 1]
active_agg = active.groupby('ID').agg(num_aggregations)
active_agg.columns = _prefixed_names('ACTIVE_', active_agg.columns.tolist())

In [20]:
temp_col

[('Credit_Days', 'max'),
 ('Credit_Days', 'mean'),
 ('Overdue_Days', 'max'),
 ('Overdue_Days', 'mean'),
 ('Max_Overdue', 'max'),
 ('Max_Overdue', 'mean'),
 ('Credit', 'median'),
 ('Credit_Debt', 'mean'),
 ('Credit_Overdue', 'max'),
 ('Credit_Overdue', 'mean'),
 ('Active_Credit_Active', 'median'),
 ('Active_Credit_Bad debt', 'median'),
 ('Active_Credit_Closed', 'median'),
 ('Active_Credit_Sold', 'median'),
 ('Active_Credit_nan', 'median'),
 ('Currency_currency 1', 'median'),
 ('Currency_currency 2', 'median'),
 ('Currency_currency 3', 'median'),
 ('Currency_currency 4', 'median'),
 ('Currency_nan', 'median'),
 ('Type_Another type of loan', 'median'),
 ('Type_Car loan', 'median'),
 ('Type_Cash loan (non-earmarked)', 'median'),
 ('Type_Consumer credit', 'median'),
 ('Type_Credit card', 'median'),
 ('Type_Loan for business development', 'median'),
 ('Type_Loan for working capital replenishment', 'median'),
 ('Type_Microloan', 'median'),
 ('Type_Mortgage', 'median'),
 ('Type_Real estate loa

In [32]:
credit.head()

Unnamed: 0,ID,File_ID,Active_Credit,Currency,Credit_Days,Overdue_Days,Max_Overdue,Credit,Credit_Debt,Credit_Limit,Credit_Overdue,Type
0,215354,5714462,Closed,currency 1,-497,0,,91323.0,0.0,,0.0,Consumer credit
1,215354,5714463,Active,currency 1,-208,0,,225000.0,171342.0,,0.0,Credit card
2,215354,5714464,Active,currency 1,-203,0,,464323.5,,,0.0,Consumer credit
3,215354,5714465,Active,currency 1,-203,0,,90000.0,,,0.0,Credit card
4,215354,5714466,Active,currency 1,-629,0,77674.5,2700000.0,,,0.0,Consumer credit


In [33]:
active_agg.head()

Unnamed: 0_level_0,ACTIVE_Credit_Days_MAX,ACTIVE_Credit_Days_MEAN,ACTIVE_Overdue_Days_MAX,ACTIVE_Overdue_Days_MEAN,ACTIVE_Max_Overdue_MAX,ACTIVE_Max_Overdue_MEAN,ACTIVE_Credit_MEDIAN,ACTIVE_Credit_Debt_MEAN,ACTIVE_Credit_Overdue_MAX,ACTIVE_Credit_Overdue_MEAN
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100016,-128,-137.0,0,0.0,,,88996.5,63724.5,0.0,0.0
100053,-2338,-2338.0,0,0.0,,,48820.5,,0.0,0.0
100090,-315,-315.0,0,0.0,,,51957.0,21648.555,0.0,0.0
100166,-106,-206.0,0,0.0,0.0,0.0,86269.5,53207.2125,0.0,0.0
100245,-584,-910.75,0,0.0,1665.0,1665.0,540000.0,345627.0,0.0,0.0


In [34]:
# Turn the groupby key back into a regular 'ID' column -- but only while it is
# still the index. An unconditional reset_index() adds a stray 'index' column
# every time this cell is re-run.
if bureau_agg.index.name == 'ID':
    bureau_agg = bureau_agg.reset_index()
if active_agg.index.name == 'ID':
    active_agg = active_agg.reset_index()

# Attach the active-credit statistics once. Merging again on a re-run would
# duplicate them under '_x' / '_y' suffixes.
if not any(str(col).startswith('ACTIVE_') for col in bureau_agg.columns):
    bureau_agg = bureau_agg.merge(active_agg, how='left', on='ID')

# Binary categoricals become integer codes; pd.factorize encodes missing values as -1.
for bin_feature in ['Gender', 'Own_Car']:
    new_df3[bin_feature], uniques = pd.factorize(new_df3[bin_feature])

# Remaining object columns are one-hot encoded (with a NaN indicator).
new_df3, cat_cols = one_hot_encoder(new_df3, True)

# Left join keeps every applicant, including those without any credit history.
raw_agg = new_df3.merge(bureau_agg, how='left', on='ID')
# NOTE(review): filling with 0 also overwrites the NaN that replaced the
# 365243 placeholder in Employed_Day_CNT, and makes "no credit history"
# indistinguishable from a genuine 0 -- confirm this is intended.
raw_agg = raw_agg.fillna(0)
train_df = raw_agg


In [35]:
bureau_agg.head()

Unnamed: 0,ID,BURO_Credit_Days_MAX,BURO_Credit_Days_MEAN,BURO_Overdue_Days_MAX,BURO_Overdue_Days_MEAN,BURO_Max_Overdue_MAX,BURO_Max_Overdue_MEAN,BURO_Credit_MEDIAN,BURO_Credit_Debt_MEAN,BURO_Credit_Overdue_MAX,...,ACTIVE_Credit_Days_MAX,ACTIVE_Credit_Days_MEAN,ACTIVE_Overdue_Days_MAX,ACTIVE_Overdue_Days_MEAN,ACTIVE_Max_Overdue_MAX,ACTIVE_Max_Overdue_MEAN,ACTIVE_Credit_MEDIAN,ACTIVE_Credit_Debt_MEAN,ACTIVE_Credit_Overdue_MAX,ACTIVE_Credit_Overdue_MEAN
0,100016,-128,-677.833333,0,0.0,0.0,0.0,73746.0,15931.125,0.0,...,-128.0,-137.0,0.0,0.0,,,88996.5,63724.5,0.0,0.0
1,100053,-1764,-2287.714286,0,0.0,,,77850.0,0.0,0.0,...,-2338.0,-2338.0,0.0,0.0,,,48820.5,,0.0,0.0
2,100090,-315,-315.0,0,0.0,,,51957.0,21648.555,0.0,...,-315.0,-315.0,0.0,0.0,,,51957.0,21648.555,0.0,0.0
3,100165,-1225,-1775.0,0,0.0,0.0,0.0,59034.24,0.0,0.0,...,,,,,,,,,,
4,100166,-106,-906.538462,0,0.0,9147.645,1706.608125,28476.0,8185.725,0.0,...,-106.0,-206.0,0.0,0.0,0.0,0.0,86269.5,53207.2125,0.0,0.0


In [36]:
# 5-fold stratified cross-validation; the fold assignment is seeded so it is reproducible.
folds = StratifiedKFold(n_splits= 5, shuffle=True, random_state=42)
# Out-of-fold predictions: each row is scored by the model that did not train on it.
oof_preds = np.zeros(train_df.shape[0])
# Everything except the target and identifier-like columns is used as a feature.
feats = [f for f in train_df.columns if f not in ['Default','ID','File_ID','index']]

# Slice the feature matrix and target once instead of on every fold.
features = train_df[feats]
target = train_df['Default']
# Per-fold importance frames are collected here and concatenated once after the
# loop, rather than growing a DataFrame with pd.concat on every iteration.
fold_importances = []

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(features, target)):
    train_x, train_y = features.iloc[train_idx], target.iloc[train_idx]
    valid_x, valid_y = features.iloc[valid_idx], target.iloc[valid_idx]

    clf = LGBMClassifier(boosting_type='goss',nthread=4,n_estimators=2000,learning_rate=0.003,
        num_leaves=300,colsample_bytree=0.70,subsample=0.85,max_depth=10, reg_alpha=0.00,metric='binary_logloss',
        reg_lambda=0.00,min_child_weight=30,silent=-1,verbose=1)

    # NOTE(review): `verbose=` and `early_stopping_rounds=` as fit() arguments are
    # only accepted by older LightGBM releases; newer ones expect callbacks.
    # Confirm the pinned LightGBM version before upgrading.
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 100, early_stopping_rounds= 30)

    # Score the held-out fold with the best iteration found by early stopping.
    oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]

    fold_importance_df = pd.DataFrame({
        "feature": feats,
        "importance": clf.feature_importances_,
        "fold": n_fold + 1,
    })
    fold_importances.append(fold_importance_df)

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
    gc.collect()

# One row per (feature, fold); each fold keeps its own 0..n-1 index, as before.
feature_importance_df = pd.concat(fold_importances, axis=0)

print('Full AUC score %.6f' % roc_auc_score(train_df['Default'], oof_preds))

Training until validation scores don't improve for 30 rounds.
[100]	training's binary_logloss: 0.273077	training's auc: 0.734227	valid_1's binary_logloss: 0.275593	valid_1's auc: 0.669285
Early stopping, best iteration is:
[138]	training's binary_logloss: 0.270709	training's auc: 0.737347	valid_1's binary_logloss: 0.274133	valid_1's auc: 0.67034
Fold  1 AUC : 0.670340
Training until validation scores don't improve for 30 rounds.
[100]	training's binary_logloss: 0.27298	training's auc: 0.731797	valid_1's binary_logloss: 0.275648	valid_1's auc: 0.667955
[200]	training's binary_logloss: 0.267279	training's auc: 0.741127	valid_1's binary_logloss: 0.272265	valid_1's auc: 0.67148
Early stopping, best iteration is:
[200]	training's binary_logloss: 0.267279	training's auc: 0.741127	valid_1's binary_logloss: 0.272265	valid_1's auc: 0.67148
Fold  2 AUC : 0.671480
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.275