In [1]:
import os
import math
import subprocess
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from IPython.display import display

#
from utils_feature_engineering import *

# some settings for displaying Pandas results
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision', 4)
pd.set_option('display.max_colwidth', -1)

  pd.set_option('display.max_colwidth', -1)


# Load Data

In [2]:
# load train/test data
path = r'D:\NEU\Năm 3\DATA PREP\PROJECT_DATAPREP\GITHUB REPO\dataset\dseb63_final_project_DP_dataset\\'

# train
application_train = pd.read_csv(path + 'dseb63_' + 'application_train.csv')
application_train.drop(columns='Unnamed: 0', inplace=True)

# filter by tvt code
application_tvt_extend = pd.read_pickle("application_tvt_extend.pkl", compression="bz2")
application_train_filtered = (application_tvt_extend.query("tvt_code == 'train'")
                              .merge(application_train[["SK_ID_CURR"]], on="SK_ID_CURR")
                              .drop(columns=["tvt_code"]))
application_train_filtered.head()
application_train_filtered = application_train[['SK_ID_CURR','TARGET']]

In [3]:
# Load bureau
bureau = pd.read_csv(path + 'dseb63_' + 'bureau.csv')
print(bureau.shape)
bureau.head()

(1465325, 17)


Unnamed: 0,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY,SK_ID_CURR
0,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,,254629
1,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,,254629
2,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,,254629
3,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,,254629
4,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,,254629


# Preprocess data

## One hot and frequency

- CREDIT_ACTIVE: Tình trạng của khoản vay
- CREDIT_CURRENCY

In [4]:
bureau['CREDIT_ACTIVE'].value_counts()

Closed      917733
Active      541919
Sold        5653  
Bad debt    20    
Name: CREDIT_ACTIVE, dtype: int64

In [5]:
# những khoản vay bị bán (Sold) được xem là Bad debt
print('Before :')
print(bureau['CREDIT_ACTIVE'].value_counts())
print('-'*50)

bureau.loc[bureau['CREDIT_ACTIVE'] == 'Sold', 'CREDIT_ACTIVE'] = 'Bad debt'
print('After :')
print(bureau['CREDIT_ACTIVE'].value_counts())

Before :
Closed      917733
Active      541919
Sold        5653  
Bad debt    20    
Name: CREDIT_ACTIVE, dtype: int64
--------------------------------------------------
After :
Closed      917733
Active      541919
Bad debt    5673  
Name: CREDIT_ACTIVE, dtype: int64


In [6]:
bureau['CREDIT_CURRENCY'].value_counts()

currency 1    1464094
currency 2    1072   
currency 3    150    
currency 4    9      
Name: CREDIT_CURRENCY, dtype: int64

In [7]:
# những curency hiếm gặp (currency 3,4) sẽ đưa vào chung một loại currency 2
print('Before : ')
print(bureau['CREDIT_CURRENCY'].value_counts())
print('-'*50)

bureau.loc[bureau['CREDIT_CURRENCY'].isin(['currency 3','currency 4']), 'CREDIT_CURRENCY'] = 'currency 2'
print('After :')
print(bureau['CREDIT_CURRENCY'].value_counts())

Before : 
currency 1    1464094
currency 2    1072   
currency 3    150    
currency 4    9      
Name: CREDIT_CURRENCY, dtype: int64
--------------------------------------------------
After :
currency 1    1464094
currency 2    1231   
Name: CREDIT_CURRENCY, dtype: int64


In [8]:
bureau['CREDIT_TYPE'].value_counts(dropna=False)

Consumer credit                                 1069610
Credit card                                     343853 
Car loan                                        23757  
Mortgage                                        15607  
Microloan                                       9005   
Loan for business development                   1721   
Another type of loan                            854    
Unknown type of loan                            472    
Loan for working capital replenishment          377    
Real estate loan                                23     
Cash loan (non-earmarked)                       23     
Loan for the purchase of equipment              17     
Loan for purchase of shares (margin lending)    4      
Mobile operator loan                            1      
Interbank credit                                1      
Name: CREDIT_TYPE, dtype: int64

In [9]:
df_distr_credit_type = bureau['CREDIT_TYPE'].value_counts().to_frame('freq')
df_distr_credit_type

Unnamed: 0,freq
Consumer credit,1069610
Credit card,343853
Car loan,23757
Mortgage,15607
Microloan,9005
Loan for business development,1721
Another type of loan,854
Unknown type of loan,472
Loan for working capital replenishment,377
Real estate loan,23


In [10]:
# dựa vào phần trăm bad credit > 
df_check_credit_type = application_train_filtered.merge(bureau[['SK_ID_CURR', 'CREDIT_TYPE']])\
                                     .groupby(["CREDIT_TYPE", "TARGET"]).size().to_frame("count")

df_check_credit_type = df_check_credit_type.reset_index().set_index('CREDIT_TYPE')
df_check_credit_type

Unnamed: 0_level_0,TARGET,count
CREDIT_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1
Another type of loan,0,621
Another type of loan,1,47
Car loan,0,18039
Car loan,1,1075
Cash loan (non-earmarked),0,18
Cash loan (non-earmarked),1,1
Consumer credit,0,792209
Consumer credit,1,64215
Credit card,0,251169
Credit card,1,24173


In [11]:
# pct bad credit của từng hình thức tín dụng
df_pct = df_check_credit_type[df_check_credit_type['TARGET'] == 1].join(df_distr_credit_type)
df_pct['pct'] = df_pct['count'] * 100.0 / df_pct['freq']
df_pct.sort_values(by = 'pct')

Unnamed: 0_level_0,TARGET,count,freq,pct
CREDIT_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mortgage,1,630,15607,4.0367
Cash loan (non-earmarked),1,1,23,4.3478
Car loan,1,1075,23757,4.525
Loan for business development,1,83,1721,4.8228
Unknown type of loan,1,23,472,4.8729
Another type of loan,1,47,854,5.5035
Consumer credit,1,64215,1069610,6.0036
Credit card,1,24173,343853,7.03
Loan for working capital replenishment,1,35,377,9.2838
Microloan,1,1476,9005,16.3909


In [12]:
# dựa vào phần trăm tín dụng xấu lớn hơn 9% và có phân bố nhỏ sẽ được xếp vào good/bad loan
list_good_loan = [
    "Mobile operator loan",
    "Interbank credit",
    "Loan for purchase of shares (margin lending)",
    "Real estate loan"
    "Cash loan (non-earmarked)",
    "Another type of loan"
]

list_bad_loan = [
    "Loan for the purchase of equipment",
    "Microloan",
    "Loan for working capital replenishment"
]

bureau.loc[bureau['CREDIT_TYPE'].isin(list_good_loan), 'CREDIT_TYPE'] = 'Consumer credit'
bureau.loc[bureau['CREDIT_TYPE'].isin(list_bad_loan), 'CREDIT_TYPE'] = 'Bad loan'

print('After : ')
bureau['CREDIT_TYPE'].value_counts()

After : 


Consumer credit                  1070470
Credit card                      343853 
Car loan                         23757  
Mortgage                         15607  
Bad loan                         9399   
Loan for business development    1721   
Unknown type of loan             472    
Real estate loan                 23     
Cash loan (non-earmarked)        23     
Name: CREDIT_TYPE, dtype: int64

In [13]:
%%time
dict_feat = {
    "CREDIT_ACTIVE": ['Closed', 'Active', 'Bad debt'],
    "CREDIT_CURRENCY": ['currency 1', 'currency 2'],
    "CREDIT_TYPE": ["Consumer credit", "Credit card", "Car loan", "Mortgage", "Loan for business development", "Unknown type of loan", "Bad loan"],
}
df_onehot = gen_one_hot_feat(bureau, dict_feat)
df_onehot

Wall time: 6.47 s


Unnamed: 0,SK_ID_CURR,CREDIT_ACTIVE_Closed,CREDIT_ACTIVE_Active,CREDIT_ACTIVE_Bad_debt,CREDIT_CURRENCY_currency_1,CREDIT_CURRENCY_currency_2,CREDIT_TYPE_Consumer_credit,CREDIT_TYPE_Credit_card,CREDIT_TYPE_Car_loan,CREDIT_TYPE_Mortgage,CREDIT_TYPE_Loan_for_business_development,CREDIT_TYPE_Unknown_type_of_loan,CREDIT_TYPE_Bad_loan
0,254629,1,0,0,1,0,1,0,0,0,0,0,0
1,254629,0,1,0,1,0,0,1,0,0,0,0,0
2,254629,0,1,0,1,0,1,0,0,0,0,0,0
3,254629,0,1,0,1,0,0,1,0,0,0,0,0
4,254629,0,1,0,1,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465320,66820,1,0,0,1,0,1,0,0,0,0,0,0
1465321,166221,1,0,0,1,0,1,0,0,0,0,0,0
1465322,118290,0,1,0,1,0,0,0,0,1,0,0,0
1465323,95296,1,0,0,1,0,1,0,0,0,0,0,0


In [14]:
df_agg01 = agg_common_data(df_onehot, ["max", "sum", "mean"])
eval_agg01 = feature_evaluate(application_train_filtered, df_agg01.reset_index())
display(eval_agg01)

{'CREDIT_ACTIVE_Closed': ['max', 'sum', 'mean'],
 'CREDIT_ACTIVE_Active': ['max', 'sum', 'mean'],
 'CREDIT_ACTIVE_Bad_debt': ['max', 'sum', 'mean'],
 'CREDIT_CURRENCY_currency_1': ['max', 'sum', 'mean'],
 'CREDIT_CURRENCY_currency_2': ['max', 'sum', 'mean'],
 'CREDIT_TYPE_Consumer_credit': ['max', 'sum', 'mean'],
 'CREDIT_TYPE_Credit_card': ['max', 'sum', 'mean'],
 'CREDIT_TYPE_Car_loan': ['max', 'sum', 'mean'],
 'CREDIT_TYPE_Mortgage': ['max', 'sum', 'mean'],
 'CREDIT_TYPE_Loan_for_business_development': ['max', 'sum', 'mean'],
 'CREDIT_TYPE_Unknown_type_of_loan': ['max', 'sum', 'mean'],
 'CREDIT_TYPE_Bad_loan': ['max', 'sum', 'mean']}

After agg: (263491, 36)


Unnamed: 0,name,auc,corr,coverage
2,CREDIT_ACTIVE_Closed_mean,0.5866,-0.0798,1.0
5,CREDIT_ACTIVE_Active_mean,0.5844,0.0777,1.0
4,CREDIT_ACTIVE_Active_sum,0.5607,0.0669,1.0
1,CREDIT_ACTIVE_Closed_sum,0.548,-0.031,1.0
20,CREDIT_TYPE_Credit_card_mean,0.5358,0.0338,1.0
19,CREDIT_TYPE_Credit_card_sum,0.5297,0.0337,1.0
0,CREDIT_ACTIVE_Closed_max,0.5292,-0.0471,1.0
17,CREDIT_TYPE_Consumer_credit_mean,0.529,-0.0257,1.0
3,CREDIT_ACTIVE_Active_max,0.5259,0.0363,1.0
18,CREDIT_TYPE_Credit_card_max,0.5181,0.0204,1.0


In [15]:
eval_agg01[eval_agg01['auc'] <= .501].shape

(11, 4)

In [16]:
selected_feat = eval_agg01[eval_agg01['auc'] > 0.501]['name'].tolist()
df_agg01 = df_agg01[selected_feat]
df_agg01.shape

(263491, 25)

## day to years

- DAYS_CREDIT: Ngày vay so với hiện tại
- CREDIT_DAY_OVERDUE
- DAYS_CREDIT_UPDATE
- DAYS_CREDIT_ENDDATE
- DAYS_ENDDATE_FACT

In [17]:
def days_to_years(df_input, list_cols):
    df_out = df_input[['SK_ID_CURR'] + list_cols].copy()
    col_out = []

    for cname in list_cols:
        new_name = '{}_TO_YEARS'.format(cname)
        col_out.append(new_name)
        df_out[new_name] = df_out['DAYS_CREDIT'] / -365

    return df_out[['SK_ID_CURR'] + col_out]


In [18]:
list_cols = ["DAYS_CREDIT", "CREDIT_DAY_OVERDUE",
             "DAYS_CREDIT_UPDATE", "DAYS_CREDIT_ENDDATE", "DAYS_ENDDATE_FACT"]

df_years = days_to_years(bureau, list_cols)
df_years.head()

Unnamed: 0,SK_ID_CURR,DAYS_CREDIT_TO_YEARS,CREDIT_DAY_OVERDUE_TO_YEARS,DAYS_CREDIT_UPDATE_TO_YEARS,DAYS_CREDIT_ENDDATE_TO_YEARS,DAYS_ENDDATE_FACT_TO_YEARS
0,254629,1.3616,1.3616,1.3616,1.3616,1.3616
1,254629,0.5699,0.5699,0.5699,0.5699,0.5699
2,254629,0.5562,0.5562,0.5562,0.5562,0.5562
3,254629,0.5562,0.5562,0.5562,0.5562,0.5562
4,254629,1.7233,1.7233,1.7233,1.7233,1.7233


In [19]:
df_agg02 = agg_common_data(df_years, ["min", "max", "mean", "std", "median"], main_key='SK_ID_CURR')
df_agg02.head()

{'DAYS_CREDIT_TO_YEARS': ['min', 'max', 'mean', 'std', 'median'],
 'CREDIT_DAY_OVERDUE_TO_YEARS': ['min', 'max', 'mean', 'std', 'median'],
 'DAYS_CREDIT_UPDATE_TO_YEARS': ['min', 'max', 'mean', 'std', 'median'],
 'DAYS_CREDIT_ENDDATE_TO_YEARS': ['min', 'max', 'mean', 'std', 'median'],
 'DAYS_ENDDATE_FACT_TO_YEARS': ['min', 'max', 'mean', 'std', 'median']}

After agg: (263491, 25)


Unnamed: 0_level_0,DAYS_CREDIT_TO_YEARS_min,DAYS_CREDIT_TO_YEARS_max,DAYS_CREDIT_TO_YEARS_mean,DAYS_CREDIT_TO_YEARS_std,DAYS_CREDIT_TO_YEARS_median,CREDIT_DAY_OVERDUE_TO_YEARS_min,CREDIT_DAY_OVERDUE_TO_YEARS_max,CREDIT_DAY_OVERDUE_TO_YEARS_mean,CREDIT_DAY_OVERDUE_TO_YEARS_std,CREDIT_DAY_OVERDUE_TO_YEARS_median,DAYS_CREDIT_UPDATE_TO_YEARS_min,DAYS_CREDIT_UPDATE_TO_YEARS_max,DAYS_CREDIT_UPDATE_TO_YEARS_mean,DAYS_CREDIT_UPDATE_TO_YEARS_std,DAYS_CREDIT_UPDATE_TO_YEARS_median,DAYS_CREDIT_ENDDATE_TO_YEARS_min,DAYS_CREDIT_ENDDATE_TO_YEARS_max,DAYS_CREDIT_ENDDATE_TO_YEARS_mean,DAYS_CREDIT_ENDDATE_TO_YEARS_std,DAYS_CREDIT_ENDDATE_TO_YEARS_median,DAYS_ENDDATE_FACT_TO_YEARS_min,DAYS_ENDDATE_FACT_TO_YEARS_max,DAYS_ENDDATE_FACT_TO_YEARS_mean,DAYS_ENDDATE_FACT_TO_YEARS_std,DAYS_ENDDATE_FACT_TO_YEARS_median
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
0,0.1726,0.1726,0.1726,,0.1726,0.1726,0.1726,0.1726,,0.1726,0.1726,0.1726,0.1726,,0.1726,0.1726,0.1726,0.1726,,0.1726,0.1726,0.1726,0.1726,,0.1726
1,6.4329,6.4329,6.4329,,6.4329,6.4329,6.4329,6.4329,,6.4329,6.4329,6.4329,6.4329,,6.4329,6.4329,6.4329,6.4329,,6.4329,6.4329,6.4329,6.4329,,6.4329
2,0.0822,7.9479,2.2201,3.0295,1.0096,0.0822,7.9479,2.2201,3.0295,1.0096,0.0822,7.9479,2.2201,3.0295,1.0096,0.0822,7.9479,2.2201,3.0295,1.0096,0.0822,7.9479,2.2201,3.0295,1.0096
3,0.3178,7.8493,3.0998,3.076,1.8493,0.3178,7.8493,3.0998,3.076,1.8493,0.3178,7.8493,3.0998,3.076,1.8493,0.3178,7.8493,3.0998,3.076,1.8493,0.3178,7.8493,3.0998,3.076,1.8493
4,0.8575,2.8932,2.1187,0.9012,2.5986,0.8575,2.8932,2.1187,0.9012,2.5986,0.8575,2.8932,2.1187,0.9012,2.5986,0.8575,2.8932,2.1187,0.9012,2.5986,0.8575,2.8932,2.1187,0.9012,2.5986


In [20]:
eval_agg02 = feature_evaluate(application_train_filtered, df_agg02.reset_index())
display(eval_agg02)

Unnamed: 0,name,auc,corr,coverage
12,DAYS_CREDIT_UPDATE_TO_YEARS_mean,0.6029,-0.0896,1.0
2,DAYS_CREDIT_TO_YEARS_mean,0.6029,-0.0896,1.0
22,DAYS_ENDDATE_FACT_TO_YEARS_mean,0.6029,-0.0896,1.0
17,DAYS_CREDIT_ENDDATE_TO_YEARS_mean,0.6029,-0.0896,1.0
7,CREDIT_DAY_OVERDUE_TO_YEARS_mean,0.6029,-0.0896,1.0
9,CREDIT_DAY_OVERDUE_TO_YEARS_median,0.6023,-0.0855,1.0
19,DAYS_CREDIT_ENDDATE_TO_YEARS_median,0.6023,-0.0855,1.0
14,DAYS_CREDIT_UPDATE_TO_YEARS_median,0.6023,-0.0855,1.0
24,DAYS_ENDDATE_FACT_TO_YEARS_median,0.6023,-0.0855,1.0
4,DAYS_CREDIT_TO_YEARS_median,0.6023,-0.0855,1.0


## khách hàng sử dụng trong vòng 1 năm

In [21]:
df_filtered021 = df_years[["SK_ID_CURR", "DAYS_CREDIT_TO_YEARS"]].copy()
df_filtered021["YEARS_CREDIT_1year"] = 0
idx_query = (df_years["DAYS_CREDIT_TO_YEARS"] >= 1) & (df_years["DAYS_CREDIT_TO_YEARS"] < 2)
df_filtered021.loc[idx_query, "YEARS_CREDIT_1year"] = 1
df_filtered021.drop(columns=["DAYS_CREDIT_TO_YEARS"], inplace=True)

#
df_agg021 = agg_common_data(df_filtered021, ["max", "sum"])
eval_agg021 = feature_evaluate(application_train_filtered, df_agg021.reset_index())
display(eval_agg021)

{'YEARS_CREDIT_1year': ['max', 'sum']}

After agg: (263491, 2)


Unnamed: 0,name,auc,corr,coverage
1,YEARS_CREDIT_1year_sum,0.5217,0.0308,1.0
0,YEARS_CREDIT_1year_max,0.5099,0.0107,1.0


## khách hàng sử dụng trong vòng 2 năm

In [22]:
df_filtered022 = df_years[["SK_ID_CURR", "DAYS_CREDIT_TO_YEARS"]].copy()
df_filtered022["YEARS_CREDIT_2year"] = 0
idx_query = (df_years["DAYS_CREDIT_TO_YEARS"] >= 2) & (df_years["DAYS_CREDIT_TO_YEARS"] < 3)
df_filtered022.loc[idx_query, "YEARS_CREDIT_2year"] = 1
df_filtered022.drop(columns=["DAYS_CREDIT_TO_YEARS"], inplace=True)

#
df_agg022 = agg_common_data(df_filtered022, ["max", "sum"])
eval_agg022 = feature_evaluate(application_train_filtered, df_agg022.reset_index())
display(eval_agg022)

{'YEARS_CREDIT_2year': ['max', 'sum']}

After agg: (263491, 2)


Unnamed: 0,name,auc,corr,coverage
0,YEARS_CREDIT_2year_max,0.5195,-0.0208,1.0
1,YEARS_CREDIT_2year_sum,0.518,-0.0088,1.0


## khách hàng sử dụng trong vòng lâu năm ( > 3 years)

In [23]:
df_filtered023 = df_years[["SK_ID_CURR", "DAYS_CREDIT_TO_YEARS"]].copy()
df_filtered023["YEARS_CREDIT_3year"] = 0
idx_query = df_years["DAYS_CREDIT_TO_YEARS"] >= 3
df_filtered023.loc[idx_query, "YEARS_CREDIT_3year"] = 1
df_filtered023.drop(columns=["DAYS_CREDIT_TO_YEARS"], inplace=True)

#
df_agg023 = agg_common_data(df_filtered023, ["max", "sum"])
eval_agg023 = feature_evaluate(application_train_filtered, df_agg023.reset_index())
display(eval_agg023)

{'YEARS_CREDIT_3year': ['max', 'sum']}

After agg: (263491, 2)


Unnamed: 0,name,auc,corr,coverage
1,YEARS_CREDIT_3year_sum,0.5686,-0.0503,1.0
0,YEARS_CREDIT_3year_max,0.5556,-0.0671,1.0


## keep columns

In [24]:
list_cols = [cname for cname in bureau.columns if "AMT" in cname] + ["CNT_CREDIT_PROLONG"]
df_amt = bureau[["SK_ID_CURR"] + list_cols]
df_amt.head()

Unnamed: 0,SK_ID_CURR,AMT_CREDIT_MAX_OVERDUE,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,AMT_ANNUITY,CNT_CREDIT_PROLONG
0,254629,,91323.0,0.0,,0.0,,0
1,254629,,225000.0,171342.0,,0.0,,0
2,254629,,464323.5,,,0.0,,0
3,254629,,90000.0,,,0.0,,0
4,254629,77674.5,2700000.0,,,0.0,,0


In [25]:
df_agg03 = agg_common_data(df_amt, ["min", "max", "mean", "std", "median"])

{'AMT_CREDIT_MAX_OVERDUE': ['min', 'max', 'mean', 'std', 'median'],
 'AMT_CREDIT_SUM': ['min', 'max', 'mean', 'std', 'median'],
 'AMT_CREDIT_SUM_DEBT': ['min', 'max', 'mean', 'std', 'median'],
 'AMT_CREDIT_SUM_LIMIT': ['min', 'max', 'mean', 'std', 'median'],
 'AMT_CREDIT_SUM_OVERDUE': ['min', 'max', 'mean', 'std', 'median'],
 'AMT_ANNUITY': ['min', 'max', 'mean', 'std', 'median'],
 'CNT_CREDIT_PROLONG': ['min', 'max', 'mean', 'std', 'median']}

After agg: (263491, 35)


In [26]:
eval_agg03 = feature_evaluate(application_train_filtered, df_agg03.reset_index())
display(eval_agg03)

Unnamed: 0,name,auc,corr,coverage
14,AMT_CREDIT_SUM_DEBT_median,0.559,0.00066159,0.9719
12,AMT_CREDIT_SUM_DEBT_mean,0.5566,-0.0016404,0.9719
13,AMT_CREDIT_SUM_DEBT_std,0.5501,-0.0038927,0.8125
3,AMT_CREDIT_MAX_OVERDUE_std,0.55,0.0077448,0.4499
2,AMT_CREDIT_MAX_OVERDUE_mean,0.5486,0.0025193,0.698
1,AMT_CREDIT_MAX_OVERDUE_max,0.5463,0.0033415,0.698
11,AMT_CREDIT_SUM_DEBT_max,0.5448,-0.0025605,0.9719
4,AMT_CREDIT_MAX_OVERDUE_median,0.5355,0.002321,0.698
18,AMT_CREDIT_SUM_LIMIT_std,0.5325,-0.011724,0.7284
29,AMT_ANNUITY_median,0.5313,-0.0013662,0.3029


In [27]:
eval_agg03[eval_agg03["auc"] <= 0.501].shape

(3, 4)

In [28]:
selected_feat = eval_agg03[eval_agg03["auc"] > 0.501]["name"].tolist()
df_agg03 = df_agg03[selected_feat]
print(df_agg03.shape)
df_agg03.head()

(263491, 32)


Unnamed: 0_level_0,AMT_CREDIT_SUM_DEBT_median,AMT_CREDIT_SUM_DEBT_mean,AMT_CREDIT_SUM_DEBT_std,AMT_CREDIT_MAX_OVERDUE_std,AMT_CREDIT_MAX_OVERDUE_mean,AMT_CREDIT_MAX_OVERDUE_max,AMT_CREDIT_SUM_DEBT_max,AMT_CREDIT_MAX_OVERDUE_median,AMT_CREDIT_SUM_LIMIT_std,AMT_ANNUITY_median,AMT_ANNUITY_mean,AMT_CREDIT_SUM_DEBT_min,AMT_CREDIT_SUM_LIMIT_max,AMT_CREDIT_SUM_LIMIT_mean,AMT_ANNUITY_std,AMT_ANNUITY_max,AMT_CREDIT_SUM_median,AMT_ANNUITY_min,AMT_CREDIT_SUM_mean,AMT_CREDIT_MAX_OVERDUE_min,AMT_CREDIT_SUM_min,AMT_CREDIT_SUM_max,AMT_CREDIT_SUM_LIMIT_median,AMT_CREDIT_SUM_OVERDUE_std,AMT_CREDIT_SUM_OVERDUE_mean,AMT_CREDIT_SUM_OVERDUE_max,AMT_CREDIT_SUM_std,CNT_CREDIT_PROLONG_std,AMT_CREDIT_SUM_LIMIT_min,CNT_CREDIT_PROLONG_max,CNT_CREDIT_PROLONG_mean,AMT_CREDIT_SUM_OVERDUE_median
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
0,77566.5,77566.5,,,,,77566.5,,,,,77566.5,,,,,85513.5,,85513.5,,85513.5,85513.5,,,0.0,0.0,,,,0,0.0,0.0
1,0.0,0.0,,,11666.385,11666.385,0.0,11666.385,,,,0.0,0.0,0.0,,,28350.0,,28350.0,11666.385,28350.0,28350.0,0.0,,0.0,0.0,,,0.0,0,0.0,0.0
2,0.0,16069.5,25705.1637,5334.3994,3771.99,7543.98,58333.5,3771.99,0.0,,,0.0,0.0,0.0,,,30595.5,,42390.18,0.0,20656.08,86445.0,0.0,0.0,0.0,0.0,27030.0972,0.0,0.0,0,0.0,0.0
3,0.0,28395.6429,58477.0762,0.0,0.0,0.0,157698.0,0.0,0.0,,,0.0,0.0,0.0,,,62482.5,,69136.0714,0.0,19305.0,148500.0,0.0,0.0,0.0,0.0,41184.0783,0.0,0.0,0,0.0,0.0
4,130738.5,209084.25,264456.1574,,,,655510.5,,0.0,,,0.0,0.0,0.0,,,371250.0,,470424.0,,112500.0,945000.0,0.0,0.0,0.0,0.0,317857.683,0.0,0.0,0,0.0,0.0


# Saving features

In [29]:
df_feat = df_agg01.join(df_agg02).join(df_agg03).join(df_agg021).join(df_agg022).join(df_agg023)
print(df_feat.shape)
df_feat.head()

(263491, 88)


Unnamed: 0_level_0,CREDIT_ACTIVE_Closed_mean,CREDIT_ACTIVE_Active_mean,CREDIT_ACTIVE_Active_sum,CREDIT_ACTIVE_Closed_sum,CREDIT_TYPE_Credit_card_mean,CREDIT_TYPE_Credit_card_sum,CREDIT_ACTIVE_Closed_max,CREDIT_TYPE_Consumer_credit_mean,CREDIT_ACTIVE_Active_max,CREDIT_TYPE_Credit_card_max,CREDIT_TYPE_Consumer_credit_sum,CREDIT_TYPE_Bad_loan_sum,CREDIT_TYPE_Bad_loan_mean,CREDIT_TYPE_Bad_loan_max,CREDIT_TYPE_Car_loan_mean,CREDIT_TYPE_Car_loan_sum,CREDIT_TYPE_Car_loan_max,CREDIT_TYPE_Mortgage_mean,CREDIT_TYPE_Mortgage_sum,CREDIT_TYPE_Mortgage_max,CREDIT_CURRENCY_currency_1_sum,CREDIT_TYPE_Consumer_credit_max,CREDIT_ACTIVE_Bad_debt_mean,CREDIT_ACTIVE_Bad_debt_max,CREDIT_ACTIVE_Bad_debt_sum,DAYS_CREDIT_TO_YEARS_min,DAYS_CREDIT_TO_YEARS_max,DAYS_CREDIT_TO_YEARS_mean,DAYS_CREDIT_TO_YEARS_std,DAYS_CREDIT_TO_YEARS_median,CREDIT_DAY_OVERDUE_TO_YEARS_min,CREDIT_DAY_OVERDUE_TO_YEARS_max,CREDIT_DAY_OVERDUE_TO_YEARS_mean,CREDIT_DAY_OVERDUE_TO_YEARS_std,CREDIT_DAY_OVERDUE_TO_YEARS_median,DAYS_CREDIT_UPDATE_TO_YEARS_min,DAYS_CREDIT_UPDATE_TO_YEARS_max,DAYS_CREDIT_UPDATE_TO_YEARS_mean,DAYS_CREDIT_UPDATE_TO_YEARS_std,DAYS_CREDIT_UPDATE_TO_YEARS_median,DAYS_CREDIT_ENDDATE_TO_YEARS_min,DAYS_CREDIT_ENDDATE_TO_YEARS_max,DAYS_CREDIT_ENDDATE_TO_YEARS_mean,DAYS_CREDIT_ENDDATE_TO_YEARS_std,DAYS_CREDIT_ENDDATE_TO_YEARS_median,DAYS_ENDDATE_FACT_TO_YEARS_min,DAYS_ENDDATE_FACT_TO_YEARS_max,DAYS_ENDDATE_FACT_TO_YEARS_mean,DAYS_ENDDATE_FACT_TO_YEARS_std,DAYS_ENDDATE_FACT_TO_YEARS_median,AMT_CREDIT_SUM_DEBT_median,AMT_CREDIT_SUM_DEBT_mean,AMT_CREDIT_SUM_DEBT_std,AMT_CREDIT_MAX_OVERDUE_std,AMT_CREDIT_MAX_OVERDUE_mean,AMT_CREDIT_MAX_OVERDUE_max,AMT_CREDIT_SUM_DEBT_max,AMT_CREDIT_MAX_OVERDUE_median,AMT_CREDIT_SUM_LIMIT_std,AMT_ANNUITY_median,AMT_ANNUITY_mean,AMT_CREDIT_SUM_DEBT_min,AMT_CREDIT_SUM_LIMIT_max,AMT_CREDIT_SUM_LIMIT_mean,AMT_ANNUITY_std,AMT_ANNUITY_max,AMT_CREDIT_SUM_median,AMT_ANNUITY_min,AMT_CREDIT_SUM_mean,AMT_CREDIT_MAX_OVERDUE_min,AMT_CREDIT_SUM_min,AMT_CREDIT_SUM_max,AMT_CREDIT_SUM_LIMIT_median,AMT_CREDIT_SUM_OVERDUE_std,AMT_CREDIT_SUM_OVERDUE_mean,AMT_CREDIT_SUM_OVERDUE_max,AMT_CREDIT_SUM_std,CNT_CREDIT_PROLONG_std,AMT_CREDIT_SUM_LIMIT_min,CNT_CREDIT_PROLONG_max,CNT_CREDIT_PROLONG_mean,AMT_CREDIT_SUM_OVERDUE_median,YEARS_CREDIT_1year_max,YEARS_CREDIT_1year_sum,YEARS_CREDIT_2year_max,YEARS_CREDIT_2year_sum,YEARS_CREDIT_3year_max,YEARS_CREDIT_3year_sum
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1
0,0.0,1.0,1,0,0.0,0,0,1.0,1,0,1,0,0.0,0,0.0,0,0,0.0,0,0,1,1,0.0,0,0,0.1726,0.1726,0.1726,,0.1726,0.1726,0.1726,0.1726,,0.1726,0.1726,0.1726,0.1726,,0.1726,0.1726,0.1726,0.1726,,0.1726,0.1726,0.1726,0.1726,,0.1726,77566.5,77566.5,,,,,77566.5,,,,,77566.5,,,,,85513.5,,85513.5,,85513.5,85513.5,,,0.0,0.0,,,,0,0.0,0.0,0,0,0,0,0,0
1,0.0,1.0,1,0,0.0,0,0,1.0,1,0,1,0,0.0,0,0.0,0,0,0.0,0,0,1,1,0.0,0,0,6.4329,6.4329,6.4329,,6.4329,6.4329,6.4329,6.4329,,6.4329,6.4329,6.4329,6.4329,,6.4329,6.4329,6.4329,6.4329,,6.4329,6.4329,6.4329,6.4329,,6.4329,0.0,0.0,,,11666.385,11666.385,0.0,11666.385,,,,0.0,0.0,0.0,,,28350.0,,28350.0,11666.385,28350.0,28350.0,0.0,,0.0,0.0,,,0.0,0,0.0,0.0,0,0,0,0,1,1
2,0.6667,0.3333,2,4,0.0,0,1,1.0,1,0,6,0,0.0,0,0.0,0,0,0.0,0,0,6,1,0.0,0,0,0.0822,7.9479,2.2201,3.0295,1.0096,0.0822,7.9479,2.2201,3.0295,1.0096,0.0822,7.9479,2.2201,3.0295,1.0096,0.0822,7.9479,2.2201,3.0295,1.0096,0.0822,7.9479,2.2201,3.0295,1.0096,0.0,16069.5,25705.1637,5334.3994,3771.99,7543.98,58333.5,3771.99,0.0,,,0.0,0.0,0.0,,,30595.5,,42390.18,0.0,20656.08,86445.0,0.0,0.0,0.0,0.0,27030.0972,0.0,0.0,0,0.0,0.0,1,1,0,0,1,2
3,0.5714,0.4286,3,4,0.1429,1,1,0.8571,1,1,6,0,0.0,0,0.0,0,0,0.0,0,0,7,1,0.0,0,0,0.3178,7.8493,3.0998,3.076,1.8493,0.3178,7.8493,3.0998,3.076,1.8493,0.3178,7.8493,3.0998,3.076,1.8493,0.3178,7.8493,3.0998,3.076,1.8493,0.3178,7.8493,3.0998,3.076,1.8493,0.0,28395.6429,58477.0762,0.0,0.0,0.0,157698.0,0.0,0.0,,,0.0,0.0,0.0,,,62482.5,,69136.0714,0.0,19305.0,148500.0,0.0,0.0,0.0,0.0,41184.0783,0.0,0.0,0,0.0,0.0,1,3,0,0,1,2
4,0.5,0.5,3,3,0.1667,1,1,0.8333,1,1,5,0,0.0,0,0.0,0,0,0.0,0,0,6,1,0.0,0,0,0.8575,2.8932,2.1187,0.9012,2.5986,0.8575,2.8932,2.1187,0.9012,2.5986,0.8575,2.8932,2.1187,0.9012,2.5986,0.8575,2.8932,2.1187,0.9012,2.5986,0.8575,2.8932,2.1187,0.9012,2.5986,130738.5,209084.25,264456.1574,,,,655510.5,,0.0,,,0.0,0.0,0.0,,,371250.0,,470424.0,,112500.0,945000.0,0.0,0.0,0.0,0.0,317857.683,0.0,0.0,0,0.0,0.0,1,1,1,4,0,0


In [30]:
%%time
fname = "bureau"
fname = os.path.join("features", "{}.pkl.bz2".format(fname))
df_feat.to_pickle(fname, compression="bz2")
print("Store features completed!")

Store features completed!
Wall time: 11.1 s
