In [1]:
from datetime import datetime
from itertools import combinations
import os
import re

import pandas as pd
import polars as pl
import numpy as np
import matplotlib as plt
import seaborn as sns
import lightgbm as lgb
import sklearn as sk
import mlflow
from feature_engine.encoding import CountFrequencyEncoder, WoEEncoder, RareLabelEncoder
from feature_engine.selection import (DropHighPSIFeatures, SelectByInformationValue, 
    SelectBySingleFeaturePerformance, SelectByTargetMeanPerformance, ProbeFeatureSelection)
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

import data_proc as dp

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)

In [2]:
base, X, y = dp.load_data('data/train_v3.parquet')

base.shape

(1526659, 3)

In [3]:
X = pd.read_parquet('data/train_v4.parquet')

In [3]:
X.shape

(1526659, 546)

In [7]:
[x for x in X.columns if 'week' in x]

['pipeline-3__decision_day_of_week_1',
 'pipeline-3__decision_day_of_week_2',
 'pipeline-3__decision_day_of_week_3',
 'pipeline-3__decision_day_of_week_4',
 'pipeline-3__decision_day_of_week_5',
 'pipeline-3__decision_day_of_week_6',
 'pipeline-3__decision_day_of_week_7']

In [4]:
cat_cols_base = list(X.select_dtypes("category").columns)
num_cols_base = list(X.select_dtypes(exclude="category").columns)

# Feature Engineering

In [32]:
X['avginstallast24m_3658937A'].describe() , X['avgpmtlast12m_4525200A'].describe()

(count    901784.000000
 mean       5401.586914
 std        6531.562500
 min           0.000000
 25%        2528.400146
 50%        4068.600098
 75%        6551.800293
 max      496148.812500
 Name: avginstallast24m_3658937A, dtype: float64,
 count    499672.000000
 mean       6403.573242
 std        9248.010742
 min           0.000000
 25%        2590.199951
 50%        4417.600098
 75%        7521.000000
 max      495910.406250
 Name: avgpmtlast12m_4525200A, dtype: float64)

In [41]:
for col in [x for x in X.columns if 'annuity' in x]:
    print(X[col].describe())

count    1.526655e+06

mean     1.435775e+03

std      2.807021e+03

min      0.000000e+00

25%      0.000000e+00

50%      0.000000e+00

75%      2.029400e+03

max      8.750000e+04

Name: annuitynextmonth_57A, dtype: float64

count    1.220640e+06

mean     3.460657e+04

std      9.499800e+04

min      0.000000e+00

25%      4.000000e+03

50%      1.075853e+04

75%      3.060000e+04

max      2.092309e+07

Name: maxannuity_159A, dtype: float64

count    1.217920e+06

mean     5.674027e+03

std      4.148343e+03

min      0.000000e+00

25%      3.012200e+03

50%      4.834600e+03

75%      7.298800e+03

max      1.051302e+05

Name: max_annuity_853A, dtype: float64

count    1.217920e+06

mean     3.379311e+03

std      2.125500e+03

min      0.000000e+00

25%      2.116200e+03

50%      3.006733e+03

75%      4.168300e+03

max      1.030000e+05

Name: mean_annuity_853A, dtype: float64

count    1.049002e+06

mean     5.483421e+03

std      4.094210e+03

min      0.000000e+00

25%     

In [5]:
X['debt_credit_ratio'] = X['currdebt_22A'] / X['credamount_770A']

In [6]:
X['credit_annuity_ratio'] = X['annuitynextmonth_57A'] / X['credamount_770A']

In [7]:
X['annuity_to_max_installment_ratio'] = X['maxinstallast24m_3658928A'] / X['annuitynextmonth_57A']

In [8]:
X['avg_pmt_instl_diff'] = X['avgpmtlast12m_4525200A'] - X['avginstallast24m_3658937A']

In [10]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 11,
    "num_leaves": 144,
    "learning_rate": 0.05,
    "feature_fraction": 0.54,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 1000,
    "verbose": -1,
    "min_child_weight": 1.8,
    "reg_alpha": 5,
    "reg_lambda": 100,
    "colsample_bytree": 0.633,
    "feature_pre_filter": False,
    "min_child_samples": 237,
    "subsample": 0.52,
}

In [20]:
train_data = lgb.Dataset(X, label=y)

In [21]:
eval_result = {}
res = lgb.cv(params, train_data, callbacks=[lgb.log_evaluation(200), lgb.early_stopping(5), lgb.record_evaluation(eval_result)], return_cvbooster=True)



Training until validation scores don't improve for 5 rounds
[200]	cv_agg's valid auc: 0.855658 + 0.00108209
[400]	cv_agg's valid auc: 0.859957 + 0.00078793
Early stopping, best iteration is:
[571]	cv_agg's valid auc: 0.861245 + 0.000675882


[200]	cv_agg's valid auc: 0.854641 + 0.000911432
[400]	cv_agg's valid auc: 0.859048 + 0.000691597
[600]	cv_agg's valid auc: 0.860499 + 0.000502882
Early stopping, best iteration is:
[627]	cv_agg's valid auc: 0.860611 + 0.000517997

# Experiment 1
full data no encoding

In [5]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 3,
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 1000,
    "verbose": -1,
}

In [36]:
train_data = lgb.Dataset(X.drop(columns=['debt_credit_ratio']), label=y)

In [37]:
eval_result = {}
res = lgb.cv(params, train_data, callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10), lgb.record_evaluation(eval_result)], return_cvbooster=True)




Training until validation scores don't improve for 10 rounds

[50]	cv_agg's valid auc: 0.831853 + 0.0020459

[100]	cv_agg's valid auc: 0.843353 + 0.00170415

[150]	cv_agg's valid auc: 0.847779 + 0.00162608

[200]	cv_agg's valid auc: 0.850054 + 0.00145959

[250]	cv_agg's valid auc: 0.851437 + 0.00148015

[300]	cv_agg's valid auc: 0.852327 + 0.00138311

[350]	cv_agg's valid auc: 0.85301 + 0.00133786

[400]	cv_agg's valid auc: 0.853474 + 0.00127998

[450]	cv_agg's valid auc: 0.853795 + 0.00130686

[500]	cv_agg's valid auc: 0.853995 + 0.00124696

[550]	cv_agg's valid auc: 0.854211 + 0.00122698

[600]	cv_agg's valid auc: 0.854384 + 0.00126986

[650]	cv_agg's valid auc: 0.854505 + 0.00131452

Early stopping, best iteration is:

[672]	cv_agg's valid auc: 0.854563 + 0.00132022


# Fillna

In [9]:
def fill_employedtotal(x):
    if x == 'LESS_ONE':
        return 1
    if x == 'MORE_ONE':
        return 2
    if x == 'MORE_FIVE':
        return 3

In [10]:
for col in [x for x in cat_cols_base if 'employedtotal' in x]:
    X[col] = X[col].apply(fill_employedtotal).astype(float).fillna(0).astype(int)

X = X.drop(columns=['last_contaddr_matchlist_1032L'])
cat_cols_base = list(X.select_dtypes("category").columns)

fill_empty = ['housetype',
'bankacctype',
'credacc_status',
'relationshiptoclient',
'remitter',
'familystate',
'cardtype',
'typesuite',
'empl_industry',
'sex',
'contaddr_smempladdr',
'requesttype',
'incometype',
'credtype',
'inittransactioncode',
'disbursement',
'type_25L',
'role_1084L',
'maritalst',
'description',
'education',
'opencred',
'paytype',
'rejectreason',
'cancelreason',
'postype',
'lastst',
'twobodfilling',
'contaddr_matchlist',
'status_'
]

for pat in fill_empty:
    print(pat)
    for col in [x for x in cat_cols_base if pat in x]:
        if '' not in X[col].cat.categories:
            X[col] = X[col].cat.add_categories('')
        X[col] = X[col].fillna('')

fill_false = ['equality',
'isdebitcard',
'safeguaranty',
'isbidproduct']

for pat in fill_false:
    print(pat)
    for col in [x for x in cat_cols_base if pat in x]:
        X[col] = X[col].fillna(False)

for col in cat_cols_base:
    if '' not in X[col].cat.categories:
        X[col] = X[col].cat.add_categories('')
    X[col] = X[col].fillna('')

housetype
bankacctype
credacc_status
relationshiptoclient
remitter
familystate
cardtype
typesuite
empl_industry
sex
contaddr_smempladdr
requesttype
incometype
credtype
inittransactioncode
disbursement
type_25L
role_1084L
maritalst
description
education
opencred
paytype
rejectreason
cancelreason
postype
lastst
twobodfilling
contaddr_matchlist
status_
equality
isdebitcard
safeguaranty
isbidproduct


In [11]:
cat_cols_base = list(X.select_dtypes("category").columns)

In [12]:
num_cols_base = list(X.select_dtypes(exclude="category").columns)

for col in [x for x in num_cols_base]:
    if X[col].nunique() == 1:
        X.drop(columns=[col], inplace=True)

In [13]:
null_median = [
    'openingdate',
    'amount',
    'revolvingaccount',
    'pmtaverage',
    'pmtcount',
    'inittransactionamount_650A',
    'responsedate',
    'dtlastpmt',
    'byoccupationinc',
    'processingdate',
    'pmtscount',
    'pmtssum_45A',
    'birthdate',
    'pmtnum',
    'numinstpaidlastcontr',
    'personindex',
    'maininc',
    'pctinstlsallpaidlat',
    'cntpmts24',
    'pctinstlsallpaidear',
    'mainoccupationinc',
    'birth',
    'cntincpaycont9m',
    'numincomingpmts',
    'lastactivateddate_801D',
    'posf'
]

null_mean = [
    'employedfrom',
    'maxpmtlast3m_4525190A',
    'maxlnamtstart6m_4525199A',
    'avgpmtlast12m_4525200A',
    'dtlastpmtallstes_4499206D',
    'firstclxcampaign_1125D',
    'avgdbdtollast24m_4525197P',
    'numinstpaid',
    'numinstlsallpaid',
    'numinstregular',
    'numinstpaidearl',
    'outstandingdebt',
    'currdebt',
    'maxinstallast24m',
    'avginstallast24m',
    'amtinstpaidbefduel24m',
    'sumoutstandtotal',
    'downpmt',
    'credacc_credlmt',
    'maxdebt4',
    'price_1097A',
    'eir_270L'
]

null_min = [
    'validfrom',
    'lastdelinqdate',
    'assignmentdate',
    'datelastunpaid',
    'approvaldate',
    'firstnonzeroinstldate',
    'firstdatedue',
    'dpd',
    'lastrejectdate_50D',
    'numinsttopaygr',
    'daysoverduetolerancedd_3976961L',
    'annuity',
    'creationdate',
    'lastapplicationdate'
]

null_max = [
    'datefirstoffer',
    'dateactivated_425D',
    'approvaldate_319D',
    'lastapprdate'
]

numeric_cols_base = list(X.select_dtypes(include=np.number).columns)

In [14]:
def fill_min(data: pd.DataFrame):
    for pat in null_min:
        print(pat)
        for col in [x for x in numeric_cols_base if pat in x]:
            data[col] = data[col].fillna(data[col].min())
    data[['maxdpdinstldate_3546855D']] = data[['maxdpdinstldate_3546855D']].fillna(data['maxdpdinstldate_3546855D'].min())


def fill_max(data: pd.DataFrame):
    for pat in null_max:
        print(pat)
        for col in [x for x in numeric_cols_base if pat in x]:
            data[col] = data[col].fillna(data[col].max())


def fill0(data: pd.DataFrame):
    cols_to_fill = [
        x for x in data.columns if (
            'deduc' in x or 'balanc' in x or 'hisbal' in x or 'credacc_transactions' in x or 'totinstallast1m' in x
        ) and x in numeric_cols_base
    ] + ['sumoutstandtotalest_4493215A']
    data[cols_to_fill] = data[cols_to_fill].fillna(0)

def fill_neg_1(data):
    cols_to_fill = [
        x for x in data.columns if ('num_group1' in x or 'contractssum' in x or 'avglnamtstart24m' in x) and x in numeric_cols_base
    ]
    data[cols_to_fill] = data[cols_to_fill].fillna(-1)

def fill_num(data):
    #data[[x for x in data.columns if 'childnum' in x and x in numeric_cols_base]] = data[[x for x in data.columns if 'childnum' in x and x in numeric_cols_base]].fillna(6)
    #data[[x for x in data.columns if 'recorddate' in x and x in numeric_cols_base]] = data[[x for x in data.columns if 'recorddate' in x and x in numeric_cols_base]].fillna(15)
    #fill_min(data)
    fill_max(data)
    fill0(data)
    fill_neg_1(data)
    for pat in null_median:
        print(pat)
        for col in [x for x in numeric_cols_base if pat in x]:
            data[col] = data[col].fillna(data[col].median())
    for pat in null_mean:
        print(pat)
        for col in [x for x in numeric_cols_base if pat in x]:
            data[col] = data[col].fillna(data[col].mean())
    #fill the rest with median
    for col in numeric_cols_base:
        data[col] = data[col].fillna(data[col].median())

In [15]:
fill_num(X)

datefirstoffer
dateactivated_425D
approvaldate_319D
lastapprdate
openingdate
amount
revolvingaccount
pmtaverage
pmtcount
inittransactionamount_650A
responsedate
dtlastpmt
byoccupationinc
processingdate
pmtscount
pmtssum_45A
birthdate
pmtnum
numinstpaidlastcontr
personindex
maininc
pctinstlsallpaidlat
cntpmts24
pctinstlsallpaidear
mainoccupationinc
birth
cntincpaycont9m
numincomingpmts
lastactivateddate_801D
posf
employedfrom


  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan


maxpmtlast3m_4525190A
maxlnamtstart6m_4525199A
avgpmtlast12m_4525200A
dtlastpmtallstes_4499206D
firstclxcampaign_1125D
avgdbdtollast24m_4525197P
numinstpaid
numinstlsallpaid
numinstregular
numinstpaidearl
outstandingdebt
currdebt
maxinstallast24m
avginstallast24m
amtinstpaidbefduel24m
sumoutstandtotal
downpmt
credacc_credlmt
maxdebt4
price_1097A
eir_270L


In [20]:
train_data = lgb.Dataset(X, label=y)
eval_result = {}
res = lgb.cv(params, train_data, callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10), lgb.record_evaluation(eval_result)], return_cvbooster=True)




Training until validation scores don't improve for 10 rounds

[50]	cv_agg's valid auc: 0.799259 + 0.00249848

[100]	cv_agg's valid auc: 0.821241 + 0.0022

[150]	cv_agg's valid auc: 0.829888 + 0.00203137

[200]	cv_agg's valid auc: 0.834831 + 0.0020448

[250]	cv_agg's valid auc: 0.83801 + 0.00202485

[300]	cv_agg's valid auc: 0.840176 + 0.00194495

[350]	cv_agg's valid auc: 0.841812 + 0.00192079

[400]	cv_agg's valid auc: 0.843036 + 0.00191219

[450]	cv_agg's valid auc: 0.844066 + 0.00188068

[500]	cv_agg's valid auc: 0.844927 + 0.00177327

[550]	cv_agg's valid auc: 0.845647 + 0.00175757

[600]	cv_agg's valid auc: 0.846259 + 0.00174041

[650]	cv_agg's valid auc: 0.846821 + 0.00170665

[700]	cv_agg's valid auc: 0.847251 + 0.00167268

[750]	cv_agg's valid auc: 0.847757 + 0.00168904

[800]	cv_agg's valid auc: 0.848141 + 0.00168106

[850]	cv_agg's valid auc: 0.848512 + 0.00163273

[900]	cv_agg's valid auc: 0.848826 + 0.00166197

[950]	cv_agg's valid auc: 0.849143 + 0.00166913

[1000]	cv_agg'

In [16]:
rle = RareLabelEncoder(n_categories = 1)
X = rle.fit_transform(X)

print(X.shape)
to_rm = []
for col in cat_cols_base:
    if base[X[col] == 'Rare']['target'].nunique() == 1:
        to_rm += list(X[X[col] == 'Rare'].index)
X = X.drop(index=to_rm)
print(X.shape)

for col in cat_cols_base:
    X[col] = X[col].cat.remove_unused_categories()

y = y[X.index]
base = base.iloc[X.index, :]

woe = WoEEncoder()
X = woe.fit_transform(X, y)



(1526659, 541)
(1526537, 541)


In [17]:
import pickle

with open('rle.pickle', 'wb') as f:
    pickle.dump(rle, f)

with open('woe.pickle', 'wb') as f:
    pickle.dump(woe, f)

In [19]:
list(X.columns)

['month_decision',
 'weekday_decision',
 'assignmentdate_238D',
 'assignmentdate_4527235D',
 'birthdate_574D',
 'contractssum_5085716L',
 'dateofbirth_337D',
 'days120_123L',
 'days180_256L',
 'days30_165L',
 'days360_512L',
 'days90_310L',
 'description_5085714M',
 'education_1103M',
 'education_88M',
 'firstquarter_103L',
 'fourthquarter_440L',
 'maritalst_385M',
 'maritalst_893M',
 'numberofqueries_373L',
 'pmtaverage_3A',
 'pmtaverage_4527227A',
 'pmtcount_4527229L',
 'pmtcount_693L',
 'pmtscount_423L',
 'pmtssum_45A',
 'requesttype_4525192L',
 'responsedate_1012D',
 'responsedate_4527233D',
 'responsedate_4917613D',
 'secondquarter_766L',
 'thirdquarter_1082L',
 'actualdpdtolerance_344P',
 'amtinstpaidbefduel24m_4187115A',
 'annuity_780A',
 'annuitynextmonth_57A',
 'applicationcnt_361L',
 'applications30d_658L',
 'applicationscnt_1086L',
 'applicationscnt_464L',
 'applicationscnt_629L',
 'applicationscnt_867L',
 'avgdbddpdlast24m_3658932P',
 'avgdbddpdlast3m_4187120P',
 'avgdbdtol

In [20]:
train_data = lgb.Dataset(X, label=y)
eval_result = {}
res = lgb.cv(params, train_data, callbacks=[lgb.log_evaluation(200), lgb.early_stopping(10), lgb.record_evaluation(eval_result)], return_cvbooster=True)



Training until validation scores don't improve for 10 rounds
[200]	cv_agg's valid auc: 0.853894 + 0.00152371
[400]	cv_agg's valid auc: 0.858603 + 0.00142723
[600]	cv_agg's valid auc: 0.860113 + 0.00139288
[800]	cv_agg's valid auc: 0.860659 + 0.00140393
Early stopping, best iteration is:
[900]	cv_agg's valid auc: 0.860831 + 0.00138246


In [19]:
pd.concat([base, X], axis=1).to_parquet('data/train_v3_filled_woe.parquet', index=False)

In [31]:
def get_lgbm_varimp(model, train_columns, max_vars=50):
    cv_varimp_df = pd.DataFrame([train_columns, model.feature_importance()]).T
    cv_varimp_df.columns = ['feature_name', 'varimp']
    cv_varimp_df.sort_values(by='varimp', ascending=False, inplace=True)
    cv_varimp_df = cv_varimp_df.iloc[0:max_vars]   
    return cv_varimp_df

In [35]:
imps = get_lgbm_varimp(res['cvbooster'].boosters[0], X.columns)
for i in range(1,len(res['cvbooster'].boosters)):
    imps.add(get_lgbm_varimp(res['cvbooster'].boosters[i], X.columns))

In [36]:
imps

Unnamed: 0,feature_name,varimp
34,annuity_780A,1459
369,mean_dateofcredstart_739D,1292
431,max_birth_259D,1217
70,credamount_770A,1068
377,mean_refreshdate_3813885D,1022
78,disbursedcredamount_1113A,983
172,price_1097A,972
365,last_refreshdate_3813885D,949
209,mean_annuity_853A,932
432,max_empl_employedfrom_271D,910


In [37]:
X[imps['feature_name']]

Unnamed: 0,annuity_780A,mean_dateofcredstart_739D,max_birth_259D,credamount_770A,mean_refreshdate_3813885D,disbursedcredamount_1113A,price_1097A,last_refreshdate_3813885D,mean_annuity_853A,max_empl_employedfrom_271D,mean_dateofcredend_289D,dateofbirth_337D,max_mainoccupationinc_384A,mean_totalamount_6A,max_dateofcredstart_739D,pmtssum_45A,mean_mainoccupationinc_437A,max_employedfrom_700D,maxannuity_159A,mean_employedfrom_700D,lastrejectdate_50D,max_dateofcredstart_181D,mean_monthlyinstlamount_332A,max_dateofcredend_289D,birthdate_574D,max_annuity_853A,mean_amount_4527230A,max_overdueamountmax2date_1002D,max_amount_4527230A,max_totalamount_6A,pmtnum_254L,mean_monthlyinstlamount_674A,max_monthlyinstlamount_332A,max_numberofoverdueinstlmaxdat_148D,last_mainoccupationinc_437A,pctinstlsallpaidearl3d_427L,max_dateofcredend_353D,lastrejectcredamount_222A,mean_credamount_590A,last_annuity_853A,firstclxcampaign_1125D,lastapprcredamount_781A,mean_numberofoverdueinstlmaxdat_148D,mean_residualamount_856A,mean_overdueamountmax2date_1142D,max_overdueamountmax2date_1142D,max_dtlastpmt_581D,mean_numberofoverdueinstlmaxdat_641D,last_credamount_590A,max_mainoccupationinc_437A
0,1917.599976,-506.0,-11874,30000.000000,-51.0,30000.000000,34464.976562,-49.0,3006.733398,-475.0,453.0,-15632.0,10800.0,39015.332031,-202.0,8391.900391,37100.000000,-2608.0,0.000000,-3036.0,-436.0,-518.0,3720.333496,603.0,-15432.0,4834.600098,1645.0,-828.0,2635.0,75128.601562,24.0,2558.708252,5367.653320,-709.0,40000.0,0.619141,-100.0,33998.000000,32177.738281,4624.600098,-1116.0,24774.0,-1589.0,8729.799805,-347.0,-244.0,-210.0,-320.0,40000.000000,57400.0
1,3134.000000,-506.0,-22435,19999.800781,-51.0,19999.800781,34464.976562,-49.0,3006.733398,-3718.0,453.0,-15632.0,10000.0,39015.332031,-202.0,8391.900391,37100.000000,-2608.0,0.000000,-3036.0,-436.0,-518.0,3720.333496,603.0,-15432.0,4834.600098,1645.0,-828.0,2635.0,75128.601562,18.0,2558.708252,5367.653320,-709.0,40000.0,0.619141,-100.0,33998.000000,32177.738281,4624.600098,-1116.0,24774.0,-1589.0,8729.799805,-347.0,-244.0,-210.0,-320.0,40000.000000,57400.0
2,4937.000000,-506.0,-16105,78000.000000,-51.0,78000.000000,34464.976562,-49.0,1161.300049,-3244.0,453.0,-15632.0,14000.0,39015.332031,-202.0,8391.900391,8200.000000,-3244.0,0.000000,-3244.0,-2102.0,-518.0,3720.333496,603.0,-15432.0,1682.400024,1645.0,-828.0,2635.0,75128.601562,36.0,2558.708252,5367.653320,-709.0,8200.0,0.619141,-100.0,10000.000000,13000.000000,1682.400024,-1116.0,24774.0,-1589.0,8729.799805,-347.0,-244.0,-210.0,-320.0,16000.000000,8200.0
3,4643.600098,-506.0,-9286,40000.000000,-51.0,40000.000000,34464.976562,-49.0,6140.000000,-233.0,453.0,-15632.0,10000.0,39015.332031,-202.0,8391.900391,11000.000000,-233.0,0.000000,-233.0,4.0,-518.0,3720.333496,603.0,-15432.0,6140.000000,1645.0,-828.0,2635.0,75128.601562,12.0,2558.708252,5367.653320,-709.0,11000.0,0.619141,-100.0,59999.800781,59999.800781,6140.000000,-1116.0,24774.0,-1589.0,8729.799805,-347.0,-244.0,-210.0,-320.0,59999.800781,11000.0
4,3390.199951,-506.0,-9134,44000.000000,-51.0,44000.000000,34464.976562,-49.0,2556.600098,-1481.0,453.0,-15632.0,24000.0,39015.332031,-202.0,8391.900391,16000.000000,-2608.0,0.000000,-3036.0,-436.0,-518.0,3720.333496,603.0,-15432.0,2556.600098,1645.0,-828.0,2635.0,75128.601562,24.0,2558.708252,5367.653320,-709.0,16000.0,0.619141,-100.0,33998.000000,40000.000000,2556.600098,-1116.0,24774.0,-1589.0,8729.799805,-347.0,-244.0,-210.0,-320.0,40000.000000,16000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,3675.400146,-637.0,-22193,30000.000000,-144.0,30000.000000,0.000000,-49.0,4890.250000,-1432.0,94.0,-22192.0,40000.0,25252.519531,-637.0,8391.900391,23116.666016,-2090.0,75521.906250,-7020.0,-1040.0,-362.0,3773.600098,94.0,-15432.0,30875.000000,1645.0,-1482.0,2635.0,77607.804688,12.0,5634.094727,3773.600098,-1482.0,20000.0,0.911621,-27.0,150000.000000,33377.500000,4624.600098,-1610.0,20020.0,-2798.0,8729.799805,-347.0,-244.0,3.0,-320.0,40000.000000,50000.0
1526655,7088.600098,-371.0,-25541,100000.000000,-144.0,40739.539062,34464.976562,-49.0,6472.866699,-1432.0,360.0,-25536.0,36800.0,94845.796875,-371.0,8391.900391,38960.000000,-2608.0,117251.601562,-3036.0,-436.0,-616.0,6191.600098,360.0,-15432.0,12809.200195,1645.0,-828.0,2635.0,121868.601562,24.0,20823.824219,6191.600098,-709.0,50000.0,0.945801,-7.0,33998.000000,74740.429688,12809.200195,-1679.0,0.0,-1589.0,8729.799805,-347.0,-244.0,-7.0,-320.0,100000.000000,50000.0
1526656,7788.800293,-1216.0,-15771,60000.000000,-144.0,60000.000000,0.000000,-49.0,4597.466797,-1432.0,125.0,-15768.0,30000.0,10662.700195,-336.0,8391.900391,29333.333984,-977.0,6600.000000,-977.0,-436.0,-413.0,2248.766113,153.0,-15432.0,9048.000000,1645.0,-828.0,2635.0,17143.400391,11.0,1721.000000,4497.532227,-709.0,34000.0,0.666504,-321.0,33998.000000,33232.000000,9048.000000,-1116.0,3998.0,-1589.0,10273.875977,-145.0,-145.0,-350.0,-145.0,80000.000000,34000.0
1526657,1195.400024,-292.0,-25814,6000.000000,-144.0,6000.000000,0.000000,-49.0,1790.355591,-1432.0,621.0,-25808.0,30000.0,29781.183594,-292.0,8391.900391,35625.000000,-2608.0,163202.000000,-3036.0,-2656.0,-691.0,2827.199951,621.0,-15432.0,5981.399902,1645.0,-1846.0,2635.0,126780.000000,6.0,7102.968750,2827.199951,-1840.0,76000.0,0.696289,-293.0,2198.000000,28521.111328,5981.399902,-1202.0,0.0,-1932.0,8729.799805,-347.0,-244.0,-293.0,-320.0,123800.000000,76000.0


In [39]:
imps.to_csv('data/imps_v3.csv', index=False)

# Checkpoint

In [1]:
from sklearn.neighbors import KNeighborsRegressor
from datetime import datetime
from itertools import combinations
import os
import re

import pandas as pd
import polars as pl
import numpy as np
import matplotlib as plt
import seaborn as sns
import lightgbm as lgb
import sklearn as sk
import mlflow
from feature_engine.encoding import CountFrequencyEncoder, WoEEncoder, RareLabelEncoder
from feature_engine.selection import (DropHighPSIFeatures, SelectByInformationValue, SelectByShuffling, SmartCorrelatedSelection,
    SelectBySingleFeaturePerformance, SelectByTargetMeanPerformance, ProbeFeatureSelection, RecursiveFeatureAddition)
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import normalize
from sklearn.linear_model import LogisticRegression
import data_proc as dp

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)

In [2]:
base, X, y = dp.load_data('data/train_v3_filled_woe.parquet')

X.shape

(1526537, 542)

In [3]:
cat_cols_base = list(X.select_dtypes("category").columns)
num_cols_base = list(X.select_dtypes(exclude="category").columns)

len(cat_cols_base), len(num_cols_base)

(0, 542)

In [6]:
imps = pd.read_csv('data/imps_v3.csv')

In [13]:
list(imps['feature_name'])

['annuity_780A',
 'mean_dateofcredstart_739D',
 'max_birth_259D',
 'credamount_770A',
 'mean_refreshdate_3813885D',
 'disbursedcredamount_1113A',
 'price_1097A',
 'last_refreshdate_3813885D',
 'mean_annuity_853A',
 'max_empl_employedfrom_271D',
 'mean_dateofcredend_289D',
 'dateofbirth_337D',
 'max_mainoccupationinc_384A',
 'mean_totalamount_6A',
 'max_dateofcredstart_739D',
 'pmtssum_45A',
 'mean_mainoccupationinc_437A',
 'max_employedfrom_700D',
 'maxannuity_159A',
 'mean_employedfrom_700D',
 'lastrejectdate_50D',
 'max_dateofcredstart_181D',
 'mean_monthlyinstlamount_332A',
 'max_dateofcredend_289D',
 'birthdate_574D',
 'max_annuity_853A',
 'mean_amount_4527230A',
 'max_overdueamountmax2date_1002D',
 'max_amount_4527230A',
 'max_totalamount_6A',
 'pmtnum_254L',
 'mean_monthlyinstlamount_674A',
 'max_monthlyinstlamount_332A',
 'max_numberofoverdueinstlmaxdat_148D',
 'last_mainoccupationinc_437A',
 'pctinstlsallpaidearl3d_427L',
 'max_dateofcredend_353D',
 'lastrejectcredamount_222A',

In [7]:
norm_df=X[imps['feature_name']].astype(float)
norm_df = (norm_df-norm_df.mean())/norm_df.std()

In [None]:
X['mean_dateofcredstart_739D'].astype(float).sum()

In [8]:
neigh = KNeighborsRegressor(n_neighbors=500, n_jobs=-1)
neigh.fit(norm_df, base['target'])

In [10]:
import pickle
with open('neigh.pickle', 'wb') as f:
    pickle.dump(neigh, f)

In [63]:
knn_target_avg = neigh.predict(norm_df)

In [11]:
neigh.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': -1,
 'n_neighbors': 500,
 'p': 2,
 'weights': 'uniform'}

In [64]:
X['knn500_target'] = knn_target_avg
X['knn500_target'].describe()

  X['knn500_target'] = knn_target_avg


count    1.526537e+06
mean     2.660921e-02
std      2.207983e-02
min      0.000000e+00
25%      1.000000e-02
50%      2.200000e-02
75%      3.600000e-02
max      2.480000e-01
Name: knn500_target, dtype: float64

# Feature Engineering TODO
- yearly interest rate x 
- knn500 mean target - need to constrain the number of cols for distance metric
- feature selection with ridge regression on encoded and normalized data
- debt_credit_ratio_None: grouped by SK_ID_CURR, the sum of all credit debt (AMT_CREDIT_SUM_DEBT) over the sum of all credit (AM_CREDIT_SUM). x
credit_annuity_ratio: AMT_CREDIT / AMT_ANNUITY x
- credit_downpayment: AMT_GOOD_PRICE - AMT_CREDIT (downpmt should be here somewhere) x (already in data)
- nstallment_payment_ratio_1000_mean_mean: Looking only at installment payments where DAYS_INSTALLMENT>-1000, take the mean of AMT_PAYMENT - AMT_INSTALMENT, grouped first by SK_ID_PREV and then by SK_ID_CURR.
- annuity_to_max_installment_ratio: AMT_ANNUITY / (maximum installment from the installments_payments table, grouped by SK_ID_CURR).
- target permutation feaure selection?
- Annuity x CNT payments / Amount of Credit = (1+ir) ^ (CNT payment /12)

In [66]:
pd.concat([base, X], axis=1).to_parquet('data/train_v3_filled_woe.parquet', index=False)

Now you have to merge the datasets somehow. knn500 will have a few nulls but should be possible to merge

In [2]:
# uncomment for unfilled base data
base, X, y = dp.load_data('data/train_v3.parquet')

In [3]:
X4 = pd.read_parquet('data/train_v4.parquet')

In [73]:
X4.head()

Unnamed: 0,pipeline-1__delta_datefirstoffer_1144D_date_decision,pipeline-1__delta_datelastinstal40dpd_247D_date_decision,pipeline-1__delta_firstclxcampaign_1125D_date_decision,pipeline-1__delta_firstdatedue_489D_date_decision,pipeline-1__delta_lastapplicationdate_877D_date_decision,pipeline-1__delta_lastapprdate_640D_date_decision,pipeline-1__delta_lastrejectdate_50D_date_decision,pipeline-1__delta_maxdpdinstldate_3546855D_date_decision,pipeline-1__delta_validfrom_1069D_date_decision,pipeline-1__delta_assignmentdate_238D_date_decision,pipeline-1__delta_responsedate_1012D_date_decision,pipeline-1__delta_responsedate_4917613D_date_decision,pipeline-1__delta_dateofcredend_289D_mean_date_decision,pipeline-1__delta_dateofcredstart_739D_mean_date_decision,pipeline-1__delta_lastupdate_1112D_mean_date_decision,pipeline-1__delta_numberofoverdueinstlmaxdat_148D_mean_date_decision,pipeline-1__delta_numberofoverdueinstlmaxdat_641D_mean_date_decision,pipeline-1__delta_overdueamountmax2date_1142D_mean_date_decision,pipeline-1__delta_refreshdate_3813885D_mean_date_decision,pipeline-1__delta_approvaldate_319D_mean_date_decision,pipeline-1__delta_creationdate_885D_mean_date_decision,pipeline-1__delta_dtlastpmtallstes_3545839D_mean_date_decision,pipeline-1__delta_employedfrom_700D_mean_birth_259D_mean,pipeline-1__delta_employedfrom_700D_mean_date_decision,pipeline-1__delta_openingdate_313D_mean_date_decision,pipeline-1__delta_empl_employedfrom_271D_mean_date_decision,pipeline-1__delta_deductiondate_4917603D_mean_date_decision,pipeline-1__delta_processingdate_168D_mean_date_decision,pipeline-1__delta_date_decision_birth_259D_mean,pipeline-2__case_id,pipeline-2__MONTH,pipeline-2__WEEK_NUM,pipeline-2__amtinstpaidbefduel24m_4187115A,pipeline-2__annuity_780A,pipeline-2__annuitynextmonth_57A,pipeline-2__applications30d_658L,pipeline-2__applicationscnt_1086L,pipeline-2__applicationscnt_867L,pipeline-2__avgdpdtolclosure24_3658938P,pipeline-2__avginstallast24m_3658937A,pipeline-2__avglnamtstart24m_4525187A,pipeline-2__avgmaxdpdlast9m_3716943P,pipeline-2__avgpmtlast12m_4525200A,pipeline-2__clientscnt_1022L,pipeline-2__clientscnt_533L,pipeline-2__clientscnt_887L,pipeline-2__cntincpaycont9m_3716944L,pipeline-2__cntpmts24_3658933L,pipeline-2__currdebtcredtyperange_828A,pipeline-2__daysoverduetolerancedd_3976961L,pipeline-2__downpmt_116A,pipeline-2__eir_270L,pipeline-2__homephncnt_628L,pipeline-2__lastapprcredamount_781A,pipeline-2__lastrejectcredamount_222A,pipeline-2__maininc_215A,pipeline-2__maxannuity_159A,pipeline-2__maxdebt4_972A,pipeline-2__maxdpdinstlnum_3546846P,pipeline-2__maxdpdlast24m_143P,pipeline-2__maxdpdlast3m_392P,pipeline-2__maxdpdlast6m_474P,pipeline-2__maxdpdtolerance_374P,pipeline-2__maxinstallast24m_3658928A,pipeline-2__maxlnamtstart6m_4525199A,pipeline-2__maxoutstandbalancel12m_4187113A,pipeline-2__maxpmtlast3m_4525190A,pipeline-2__mobilephncnt_593L,pipeline-2__numactivecreds_622L,pipeline-2__numactivecredschannel_414L,pipeline-2__numactiverelcontr_750L,pipeline-2__numcontrs3months_479L,pipeline-2__numinstls_657L,pipeline-2__numinstlswithdpd10_728L,pipeline-2__numinstlswithdpd5_4187116L,pipeline-2__numinstpaidearly5d_1087L,pipeline-2__numinstpaidlastcontr_4325080L,pipeline-2__numinstpaidlate1d_3546852L,pipeline-2__numrejects9m_859L,pipeline-2__pctinstlsallpaidearl3d_427L,pipeline-2__pctinstlsallpaidlate1d_3546856L,pipeline-2__pctinstlsallpaidlate4d_3546849L,pipeline-2__pmtnum_254L,pipeline-2__price_1097A,pipeline-2__sellerplacecnt_915L,pipeline-2__sellerplacescnt_216L,pipeline-2__totaldebt_9A,pipeline-2__totalsettled_863A,pipeline-2__totinstallast1m_4525188A,pipeline-2__contractssum_5085716L,pipeline-2__days30_165L,pipeline-2__days360_512L,pipeline-2__firstquarter_103L,pipeline-2__fourthquarter_440L,pipeline-2__pmtaverage_3A,pipeline-2__pmtscount_423L,pipeline-2__pmtssum_45A,pipeline-2__secondquarter_766L,pipeline-2__thirdquarter_1082L,pipeline-2__annualeffectiverate_199L_count,pipeline-2__annualeffectiverate_199L_mean,pipeline-2__annualeffectiverate_63L_mean,pipeline-2__contractsum_5085717L_mean,pipeline-2__credlmt_230A_mean,pipeline-2__credlmt_935A_mean,pipeline-2__debtoutstand_525A_mean,pipeline-2__instlamount_768A_mean,pipeline-2__instlamount_852A_mean,pipeline-2__monthlyinstlamount_332A_mean,pipeline-2__monthlyinstlamount_674A_mean,pipeline-2__nominalrate_281L_mean,pipeline-2__nominalrate_498L_mean,pipeline-2__numberofcontrsvalue_258L_mean,pipeline-2__numberofcontrsvalue_358L_mean,pipeline-2__numberofinstls_229L_mean,pipeline-2__numberofoutstandinstls_59L_mean,pipeline-2__numberofoverdueinstlmax_1039L_mean,pipeline-2__numberofoverdueinstlmax_1151L_mean,pipeline-2__overdueamountmax2_398A_mean,pipeline-2__prolongationcount_1120L_mean,pipeline-2__residualamount_856A_mean,pipeline-2__totalamount_6A_mean,pipeline-2__annuity_853A_mean,pipeline-2__byoccupationinc_3656910L_mean,pipeline-2__childnum_21L_mean,pipeline-2__credacc_actualbalance_314A_mean,pipeline-2__credacc_credlmt_575A_mean,pipeline-2__credacc_maxhisbal_375A_mean,pipeline-2__credacc_minhisbal_90A_mean,pipeline-2__credacc_transactions_402L_mean,pipeline-2__credamount_590A_mean,pipeline-2__currdebt_94A_mean,pipeline-2__downpmt_134A_mean,pipeline-2__mainoccupationinc_437A_mean,pipeline-2__maxdpdtolerance_577P_mean,pipeline-2__pmtnum_8L_mean,pipeline-2__revolvingaccount_394A_mean,pipeline-2__last180dayaveragebalance_704A_count,pipeline-2__amount_416A_count,pipeline-2__amount_416A_mean,pipeline-2__mainoccupationinc_384A_mean,pipeline-2__collater_typofvalofguarant_298M_mode_count,pipeline-2__collater_valueofguarantee_1124L_mean_mean,pipeline-2__collater_valueofguarantee_876L_mean_mean,pipeline-2__pmts_dpd_303P_mean_mean,pipeline-2__pmts_overdue_1140A_mean_mean,pipeline-2__pmts_overdue_1152A_mean_mean,pipeline-2__addres_district_368M_mode_count,pipeline-2__decision_day_of_month,pipeline-2__delta_overdueamountmaxdateyear_2T_mean_decision_year,pipeline-2__delta_pmts_year_1139T_mean_mean_decision_year,pipeline-2__delta_pmts_year_507T_mean_mean_decision_year,pipeline-2__delta_decision_year_birth_year,pipeline-3__credtype_322L_CAL,pipeline-3__credtype_322L_COL,pipeline-3__credtype_322L_REL,pipeline-3__credtype_322L_infrequent_sklearn,pipeline-3__disbursementtype_67L_GBA,pipeline-3__disbursementtype_67L_SBA,pipeline-3__disbursementtype_67L_infrequent,pipeline-3__disbursementtype_67L_infrequent_sklearn,pipeline-3__equalitydataagreement_891L_True,pipeline-3__equalitydataagreement_891L_nan,pipeline-3__equalitydataagreement_891L_infrequent_sklearn,pipeline-3__inittransactioncode_186L_CASH,pipeline-3__inittransactioncode_186L_POS,pipeline-3__inittransactioncode_186L_infrequent,pipeline-3__inittransactioncode_186L_infrequent_sklearn,pipeline-3__isbidproduct_1095L_True,pipeline-3__lastst_736L_A,pipeline-3__lastst_736L_D,pipeline-3__lastst_736L_K,pipeline-3__lastst_736L_T,pipeline-3__lastst_736L_nan,pipeline-3__lastst_736L_infrequent_sklearn,pipeline-3__description_5085714M_2fc785b2,pipeline-3__description_5085714M_a55475b1,pipeline-3__description_5085714M_infrequent_sklearn,pipeline-3__education_1103M_6b2ae0fa,pipeline-3__education_1103M_717ddd49,pipeline-3__education_1103M_a55475b1,pipeline-3__education_1103M_infrequent,pipeline-3__education_1103M_infrequent_sklearn,pipeline-3__maritalst_385M_3439d993,pipeline-3__maritalst_385M_a55475b1,pipeline-3__maritalst_385M_a7fcb6e5,pipeline-3__maritalst_385M_b6cabe76,pipeline-3__maritalst_385M_infrequent,pipeline-3__maritalst_385M_infrequent_sklearn,pipeline-3__requesttype_4525192L_DEDUCTION_6,pipeline-3__requesttype_4525192L_PENSION_6,pipeline-3__requesttype_4525192L_nan,pipeline-3__requesttype_4525192L_infrequent_sklearn,pipeline-3__dpdmaxdatemonth_442T_mode_1.0,pipeline-3__dpdmaxdatemonth_442T_mode_10.0,pipeline-3__dpdmaxdatemonth_442T_mode_11.0,pipeline-3__dpdmaxdatemonth_442T_mode_12.0,pipeline-3__dpdmaxdatemonth_442T_mode_2.0,pipeline-3__dpdmaxdatemonth_442T_mode_3.0,pipeline-3__dpdmaxdatemonth_442T_mode_4.0,pipeline-3__dpdmaxdatemonth_442T_mode_5.0,pipeline-3__dpdmaxdatemonth_442T_mode_6.0,pipeline-3__dpdmaxdatemonth_442T_mode_7.0,pipeline-3__dpdmaxdatemonth_442T_mode_8.0,pipeline-3__dpdmaxdatemonth_442T_mode_9.0,pipeline-3__dpdmaxdatemonth_442T_mode_nan,pipeline-3__dpdmaxdatemonth_89T_mode_1.0,pipeline-3__dpdmaxdatemonth_89T_mode_10.0,pipeline-3__dpdmaxdatemonth_89T_mode_11.0,pipeline-3__dpdmaxdatemonth_89T_mode_12.0,pipeline-3__dpdmaxdatemonth_89T_mode_2.0,pipeline-3__dpdmaxdatemonth_89T_mode_3.0,pipeline-3__dpdmaxdatemonth_89T_mode_4.0,pipeline-3__dpdmaxdatemonth_89T_mode_5.0,pipeline-3__dpdmaxdatemonth_89T_mode_6.0,pipeline-3__dpdmaxdatemonth_89T_mode_7.0,pipeline-3__dpdmaxdatemonth_89T_mode_8.0,pipeline-3__dpdmaxdatemonth_89T_mode_9.0,pipeline-3__dpdmaxdatemonth_89T_mode_nan,pipeline-3__overdueamountmaxdatemonth_284T_mode_1.0,pipeline-3__overdueamountmaxdatemonth_284T_mode_10.0,pipeline-3__overdueamountmaxdatemonth_284T_mode_11.0,pipeline-3__overdueamountmaxdatemonth_284T_mode_12.0,pipeline-3__overdueamountmaxdatemonth_284T_mode_2.0,pipeline-3__overdueamountmaxdatemonth_284T_mode_3.0,pipeline-3__overdueamountmaxdatemonth_284T_mode_4.0,pipeline-3__overdueamountmaxdatemonth_284T_mode_5.0,pipeline-3__overdueamountmaxdatemonth_284T_mode_6.0,pipeline-3__overdueamountmaxdatemonth_284T_mode_7.0,pipeline-3__overdueamountmaxdatemonth_284T_mode_8.0,pipeline-3__overdueamountmaxdatemonth_284T_mode_9.0,pipeline-3__overdueamountmaxdatemonth_284T_mode_nan,pipeline-3__overdueamountmaxdatemonth_365T_mode_1.0,pipeline-3__overdueamountmaxdatemonth_365T_mode_10.0,pipeline-3__overdueamountmaxdatemonth_365T_mode_11.0,pipeline-3__overdueamountmaxdatemonth_365T_mode_12.0,pipeline-3__overdueamountmaxdatemonth_365T_mode_2.0,pipeline-3__overdueamountmaxdatemonth_365T_mode_3.0,pipeline-3__overdueamountmaxdatemonth_365T_mode_4.0,pipeline-3__overdueamountmaxdatemonth_365T_mode_5.0,pipeline-3__overdueamountmaxdatemonth_365T_mode_6.0,pipeline-3__overdueamountmaxdatemonth_365T_mode_7.0,pipeline-3__overdueamountmaxdatemonth_365T_mode_8.0,pipeline-3__overdueamountmaxdatemonth_365T_mode_9.0,pipeline-3__overdueamountmaxdatemonth_365T_mode_nan,pipeline-3__credacc_status_367L_mode_AC,pipeline-3__credacc_status_367L_mode_CL,pipeline-3__credacc_status_367L_mode_nan,pipeline-3__credacc_status_367L_mode_infrequent_sklearn,pipeline-3__credtype_587L_mode_CAL,pipeline-3__credtype_587L_mode_COL,pipeline-3__credtype_587L_mode_REL,pipeline-3__credtype_587L_mode_nan,pipeline-3__education_1138M_mode_P33_146_175,pipeline-3__education_1138M_mode_P97_36_170,pipeline-3__education_1138M_mode_a55475b1,pipeline-3__education_1138M_mode_nan,pipeline-3__education_1138M_mode_infrequent_sklearn,pipeline-3__familystate_726L_mode_MARRIED,pipeline-3__familystate_726L_mode_SINGLE,pipeline-3__familystate_726L_mode_WIDOWED,pipeline-3__familystate_726L_mode_infrequent,pipeline-3__familystate_726L_mode_nan,pipeline-3__inittransactioncode_279L_mode_CASH,pipeline-3__inittransactioncode_279L_mode_POS,pipeline-3__inittransactioncode_279L_mode_nan,pipeline-3__inittransactioncode_279L_mode_infrequent_sklearn,pipeline-3__isdebitcard_527L_mode_False,pipeline-3__isdebitcard_527L_mode_True,pipeline-3__isdebitcard_527L_mode_nan,pipeline-3__postype_4733339M_mode_P149_40_170,pipeline-3__postype_4733339M_mode_P177_117_192,pipeline-3__postype_4733339M_mode_P46_145_78,pipeline-3__postype_4733339M_mode_a55475b1,pipeline-3__postype_4733339M_mode_infrequent,pipeline-3__postype_4733339M_mode_nan,pipeline-3__rejectreasonclient_4145042M_mode_P94_109_143,pipeline-3__rejectreasonclient_4145042M_mode_a55475b1,pipeline-3__rejectreasonclient_4145042M_mode_nan,pipeline-3__rejectreasonclient_4145042M_mode_infrequent_sklearn,pipeline-3__status_219L_mode_A,pipeline-3__status_219L_mode_D,pipeline-3__status_219L_mode_K,pipeline-3__status_219L_mode_T,pipeline-3__status_219L_mode_nan,pipeline-3__status_219L_mode_infrequent_sklearn,pipeline-3__education_927M_mode_P97_36_170,pipeline-3__education_927M_mode_a55475b1,pipeline-3__education_927M_mode_infrequent,pipeline-3__empl_employedtotal_800L_mode_MORE_FIVE,pipeline-3__empl_employedtotal_800L_mode_MORE_ONE,pipeline-3__empl_employedtotal_800L_mode_nan,pipeline-3__empl_employedtotal_800L_mode_infrequent_sklearn,pipeline-3__familystate_447L_mode_MARRIED,pipeline-3__familystate_447L_mode_SINGLE,pipeline-3__familystate_447L_mode_WIDOWED,pipeline-3__familystate_447L_mode_nan,pipeline-3__familystate_447L_mode_infrequent_sklearn,pipeline-3__housetype_905L_mode_OWNED,pipeline-3__housetype_905L_mode_nan,pipeline-3__housetype_905L_mode_infrequent_sklearn,pipeline-3__incometype_1044T_mode_EMPLOYED,pipeline-3__incometype_1044T_mode_PRIVATE_SECTOR_EMPLOYEE,pipeline-3__incometype_1044T_mode_RETIRED_PENSIONER,pipeline-3__incometype_1044T_mode_SALARIED_GOVT,pipeline-3__incometype_1044T_mode_infrequent,pipeline-3__language1_981M_mode_P10_39_147,pipeline-3__language1_981M_mode_P209_127_106,pipeline-3__language1_981M_mode_a55475b1,pipeline-3__personindex_1023L_mode_0.0,pipeline-3__personindex_1023L_mode_1.0,pipeline-3__personindex_1023L_mode_2.0,pipeline-3__personindex_1023L_mode_infrequent_sklearn,pipeline-3__persontype_1072L_mode_1.0,pipeline-3__persontype_1072L_mode_5.0,pipeline-3__persontype_1072L_mode_infrequent_sklearn,pipeline-3__persontype_792L_mode_1.0,pipeline-3__persontype_792L_mode_5.0,pipeline-3__persontype_792L_mode_infrequent,pipeline-3__relationshiptoclient_415T_mode_CHILD,pipeline-3__relationshiptoclient_415T_mode_COLLEAGUE,pipeline-3__relationshiptoclient_415T_mode_FRIEND,pipeline-3__relationshiptoclient_415T_mode_OTHER_RELATIVE,pipeline-3__relationshiptoclient_415T_mode_PARENT,pipeline-3__relationshiptoclient_415T_mode_SIBLING,pipeline-3__relationshiptoclient_415T_mode_SPOUSE,pipeline-3__relationshiptoclient_415T_mode_nan,pipeline-3__relationshiptoclient_415T_mode_infrequent_sklearn,pipeline-3__relationshiptoclient_642T_mode_CHILD,pipeline-3__relationshiptoclient_642T_mode_COLLEAGUE,pipeline-3__relationshiptoclient_642T_mode_FRIEND,pipeline-3__relationshiptoclient_642T_mode_OTHER_RELATIVE,pipeline-3__relationshiptoclient_642T_mode_PARENT,pipeline-3__relationshiptoclient_642T_mode_SIBLING,pipeline-3__relationshiptoclient_642T_mode_SPOUSE,pipeline-3__relationshiptoclient_642T_mode_nan,pipeline-3__relationshiptoclient_642T_mode_infrequent_sklearn,pipeline-3__role_1084L_mode_CL,pipeline-3__role_1084L_mode_EM,pipeline-3__role_1084L_mode_PE,pipeline-3__safeguarantyflag_411L_mode_False,pipeline-3__safeguarantyflag_411L_mode_True,pipeline-3__safeguarantyflag_411L_mode_infrequent_sklearn,pipeline-3__sex_738L_mode_M,pipeline-3__type_25L_mode_PHONE,pipeline-3__type_25L_mode_PRIMARY_MOBILE,pipeline-3__type_25L_mode_infrequent,pipeline-3__pmts_month_158T_mode_mode_1.0,pipeline-3__pmts_month_158T_mode_mode_10.0,pipeline-3__pmts_month_158T_mode_mode_11.0,pipeline-3__pmts_month_158T_mode_mode_12.0,pipeline-3__pmts_month_158T_mode_mode_2.0,pipeline-3__pmts_month_158T_mode_mode_3.0,pipeline-3__pmts_month_158T_mode_mode_4.0,pipeline-3__pmts_month_158T_mode_mode_5.0,pipeline-3__pmts_month_158T_mode_mode_6.0,pipeline-3__pmts_month_158T_mode_mode_7.0,pipeline-3__pmts_month_158T_mode_mode_8.0,pipeline-3__pmts_month_158T_mode_mode_9.0,pipeline-3__pmts_month_158T_mode_mode_nan,pipeline-3__pmts_month_706T_mode_mode_1.0,pipeline-3__pmts_month_706T_mode_mode_10.0,pipeline-3__pmts_month_706T_mode_mode_11.0,pipeline-3__pmts_month_706T_mode_mode_12.0,pipeline-3__pmts_month_706T_mode_mode_2.0,pipeline-3__pmts_month_706T_mode_mode_3.0,pipeline-3__pmts_month_706T_mode_mode_4.0,pipeline-3__pmts_month_706T_mode_mode_5.0,pipeline-3__pmts_month_706T_mode_mode_6.0,pipeline-3__pmts_month_706T_mode_mode_7.0,pipeline-3__pmts_month_706T_mode_mode_8.0,pipeline-3__pmts_month_706T_mode_mode_9.0,pipeline-3__pmts_month_706T_mode_mode_nan,pipeline-3__conts_type_509L_mode_mode_HOME_PHONE,pipeline-3__conts_type_509L_mode_mode_PHONE,pipeline-3__conts_type_509L_mode_mode_PRIMARY_MOBILE,pipeline-3__conts_type_509L_mode_mode_infrequent,pipeline-3__conts_type_509L_mode_mode_nan,pipeline-3__credacc_cards_status_52L_mode_mode_ACTIVE,pipeline-3__credacc_cards_status_52L_mode_mode_CANCELLED,pipeline-3__credacc_cards_status_52L_mode_mode_INACTIVE,pipeline-3__credacc_cards_status_52L_mode_mode_nan,pipeline-3__credacc_cards_status_52L_mode_mode_infrequent_sklearn,pipeline-3__decision_quarter_1,pipeline-3__decision_quarter_2,pipeline-3__decision_quarter_3,pipeline-3__decision_quarter_4,pipeline-3__decision_month_of_year_1,pipeline-3__decision_month_of_year_10,pipeline-3__decision_month_of_year_11,pipeline-3__decision_month_of_year_12,pipeline-3__decision_month_of_year_2,pipeline-3__decision_month_of_year_3,pipeline-3__decision_month_of_year_4,pipeline-3__decision_month_of_year_5,pipeline-3__decision_month_of_year_6,pipeline-3__decision_month_of_year_7,pipeline-3__decision_month_of_year_8,pipeline-3__decision_month_of_year_9,pipeline-3__decision_day_of_week_1,pipeline-3__decision_day_of_week_2,pipeline-3__decision_day_of_week_3,pipeline-3__decision_day_of_week_4,pipeline-3__decision_day_of_week_5,pipeline-3__decision_day_of_week_6,pipeline-3__decision_day_of_week_7,pipeline-4__lastapprcommoditycat_1041M,pipeline-4__lastcancelreason_561M,pipeline-4__lastrejectcommoditycat_161M,pipeline-4__lastrejectreason_759M,pipeline-4__lastrejectreasonclient_4145040M,pipeline-4__classificationofcontr_400M_mode,pipeline-4__contractst_964M_mode,pipeline-4__financialinstitution_382M_mode,pipeline-4__purposeofcred_874M_mode,pipeline-4__cancelreason_3545846M_mode,pipeline-4__rejectreason_755M_mode,pipeline-4__empl_industry_691L_mode,pipeline-5__previouscontdistrict_112M,pipeline-5__district_544M_mode,pipeline-5__contaddr_district_15M_mode,pipeline-5__contaddr_zipcode_807M_mode,pipeline-5__registaddr_district_1083M_mode,pipeline-5__registaddr_zipcode_184M_mode,pipeline-5__name_4527232M_mode,pipeline-5__name_4917606M_mode,pipeline-5__employername_160M_mode
0,,,,,,,,,,,,,,,,,,,,,,,,,,-0.936974,,,-0.831281,-2.2008,-1.256772e-13,-2.191088,,-0.761955,-0.663781,-0.342073,-0.235207,-1.227693,,,,,,-0.310055,-0.276172,-0.342688,,,-0.508981,,-0.251368,0.971612,-0.891713,,,,-1.935353,-1.647592,,-0.686637,-0.371443,-0.463003,-0.928267,,,,,-0.856432,-0.748553,-0.323368,-0.541199,-0.469944,-0.68233,,,,,,-0.473855,,,,0.906141,,-0.396137,-1.141163,-0.70343,-1.421679,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-2.670031,,,,,,,,-1.579505,,,,-0.785544,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.0,6.0,4.0,4.0,1.0,4.0,3.0,7.0,5.0,3.0,4.0,5.0,-0.177953,-1.260243,0.431777,0.359372,0.425454,0.275276,0.11806,0.154421,0.250446
1,,,,,,,,,,,,,,,,,,,,,,,,,,0.945824,,,1.166738,-2.200753,-1.256772e-13,-2.191088,,-0.027456,-0.663781,-0.342073,-0.235207,-1.227693,,,,,,-0.310055,-0.276172,-0.342688,,,-0.508981,,-0.251368,-0.170556,-0.891713,,,,-1.935353,-1.647592,,-0.686637,-0.371443,-0.463003,-0.928267,,,,,-0.856432,-0.748553,-0.323368,-0.541199,-0.469944,-0.68233,,,,,,-0.473855,,,,0.387084,,-0.396137,-1.141163,-0.70343,-1.421679,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-2.795795,,,,,,,,-1.579505,,,,1.202269,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.0,6.0,4.0,4.0,1.0,4.0,3.0,7.0,5.0,3.0,4.0,5.0,-0.188284,-1.272592,0.437376,0.35319,0.426409,0.268246,0.117341,0.15139,0.258625
2,,,,,1.763522,,1.264124,,,,,,,,,,,,,,1.137092,,0.547894,0.099392,,0.800643,,,0.078988,-2.200712,-1.256772e-13,-2.191088,,0.641916,-0.663781,-0.342073,-0.235207,-1.227693,,,,,,-0.310055,-0.276172,-0.342688,,,-0.508981,,-0.251368,0.971612,0.893652,,-0.872931,,-1.935353,-1.647592,,-0.686637,-0.371443,-0.463003,-0.928267,,,,,0.712513,-0.748553,-0.323368,-0.541199,-0.469944,-0.68233,,,,,,-0.473855,,,,1.615184,,-0.396137,-1.141163,-0.70343,-1.421679,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-1.232932,,-1.104205,,-0.527807,,,,-1.197669,,-0.656158,-2.173772,,0.551909,,,,,-2.24071,,,,,,,,-1.424733,,,,0.144914,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,7.0,6.0,4.0,4.0,1.0,4.0,3.0,7.0,5.0,1.0,2.0,5.0,-0.226416,-0.340693,0.409108,0.34393,0.398528,0.2539,0.11466,0.150081,0.259475
3,,,,,-1.985578,,-1.989415,,,,,,,,,,,,,,-2.66159,,-0.68845,-2.704931,,-1.452956,,,-1.508058,-2.200675,-1.256772e-13,-2.191088,,0.552236,-0.663781,2.923167,-0.235207,-0.216447,,,,,,-0.310055,-0.276172,-0.342688,,,-0.508981,,-0.251368,0.704782,-0.891713,,0.493008,,-1.935353,-1.647592,,-0.686637,-0.371443,-0.463003,-0.928267,,,,,-0.856432,-0.748553,-0.323368,-0.541199,2.118329,-0.68233,,,,,,2.083415,,,,-0.362963,,2.523727,0.0847,-0.70343,-1.421679,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.320559,,,,-0.527807,,,,0.982426,,-0.656158,-1.821339,,-0.279624,,,,,-2.795795,,,,,,,,-1.579505,,,,-1.442973,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.0,5.0,4.0,2.0,1.0,4.0,3.0,7.0,5.0,0.0,0.0,5.0,-0.177953,0.261858,0.431777,0.359372,0.425454,0.275276,0.11806,0.154421,0.250446
4,,,,,-1.985578,,,,,,,,,,,,,,,,-2.66159,,,,,0.024278,,,-1.551754,-2.20064,-1.256772e-13,-2.191088,,0.088979,-0.663781,2.923167,-0.235207,-0.216447,,,,,,-0.310055,-0.276172,-0.342688,,,-0.508981,,-0.251368,0.971612,0.893652,,,,-1.935353,-1.647592,,-0.686637,-0.371443,-0.463003,-0.928267,,,,,-0.856432,-0.748553,-0.323368,-0.541199,-0.469944,-0.68233,,,,,,-0.473855,,,,0.906141,,-0.396137,-1.141163,-0.70343,-1.421679,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.28586,,,,-0.527807,,,,0.300237,,-0.656158,-1.326439,,1.208322,,,,,-1.322599,,,,,,,,-1.424733,,,,-1.546534,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,7.0,2.0,4.0,4.0,1.0,4.0,3.0,7.0,5.0,2.0,2.0,5.0,-0.188284,0.641366,0.437376,0.35319,0.426409,0.268246,0.117341,0.15139,0.258625


In [5]:
X = X.merge(X4, left_index=True, right_index=True, how='left')

In [5]:
X.replace([np.inf, -np.inf], np.nan, inplace=True)

In [6]:
X = X.fillna(0)

In [25]:
X = (X-X.mean())/X.std()

In [28]:
X = X.fillna(0)

In [79]:
train_data = lgb.Dataset(X, label=y)
eval_result = {}
res = lgb.cv(params, train_data, callbacks=[lgb.log_evaluation(200), lgb.early_stopping(10), lgb.record_evaluation(eval_result)], return_cvbooster=True)



Training until validation scores don't improve for 10 rounds
[200]	cv_agg's valid auc: 0.858566 + 0.00116173
[400]	cv_agg's valid auc: 0.863615 + 0.00119205
[600]	cv_agg's valid auc: 0.865231 + 0.00120732
[800]	cv_agg's valid auc: 0.865771 + 0.00121943
Early stopping, best iteration is:
[841]	cv_agg's valid auc: 0.865834 + 0.00124989


In [29]:
sbs = SelectByShuffling(
        LogisticRegression(),
        cv=3,
        random_state=42,
      )
sbs.fit(X, y)
sbs.get_feature_names_out()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['birthdate_574D',
 'days120_123L',
 'days180_256L',
 'days30_165L',
 'days360_512L',
 'days90_310L',
 'description_5085714M',
 'education_1103M',
 'firstquarter_103L',
 'numberofqueries_373L',
 'pmtaverage_3A',
 'pmtaverage_4527227A',
 'pmtssum_45A',
 'requesttype_4525192L',
 'secondquarter_766L',
 'amtinstpaidbefduel24m_4187115A',
 'annuity_780A',
 'avgdbdtollast24m_4525197P',
 'avgdpdtolclosure24_3658938P',
 'avgmaxdpdlast9m_3716943P',
 'cntincpaycont9m_3716944L',
 'cntpmts24_3658933L',
 'credamount_770A',
 'credtype_322L',
 'datefirstoffer_1144D',
 'datelastinstal40dpd_247D',
 'firstdatedue_489D',
 'isbidproduct_1095L',
 'lastactivateddate_801D',
 'lastapprcommoditycat_1041M',
 'lastcancelreason_561M',
 'lastdelinqdate_224D',
 'lastrejectreason_759M',
 'lastrejectreasonclient_4145040M',
 'lastst_736L',
 'maxdbddpdtollast6m_4187119P',
 'maxdebt4_972A',
 'maxdpdinstldate_3546855D',
 'maxdpdtolerance_374P',
 'maxinstallast24m_3658928A',
 'mobilephncnt_593L',
 'monthsannuity_845L',
 'n

In [None]:
rfa = RecursiveFeatureAddition(RandomForestClassifier(random_state=42), cv=3)
rfa.fit(X, y)
rfa.get_feature_names_out()

In [6]:
import gc
del X4
gc.collect()

1334

In [9]:
psi = DropHighPSIFeatures()
psi.fit(X)
psi.get_feature_names_out()

['month_decision',
 'weekday_decision',
 'assignmentdate_238D',
 'assignmentdate_4527235D',
 'birthdate_574D',
 'contractssum_5085716L',
 'dateofbirth_337D',
 'days120_123L',
 'days180_256L',
 'days30_165L',
 'days360_512L',
 'days90_310L',
 'description_5085714M',
 'education_1103M',
 'education_88M',
 'firstquarter_103L',
 'fourthquarter_440L',
 'maritalst_385M',
 'maritalst_893M',
 'numberofqueries_373L',
 'pmtaverage_3A',
 'pmtaverage_4527227A',
 'pmtcount_4527229L',
 'pmtcount_693L',
 'pmtscount_423L',
 'pmtssum_45A',
 'requesttype_4525192L',
 'responsedate_1012D',
 'responsedate_4527233D',
 'responsedate_4917613D',
 'secondquarter_766L',
 'thirdquarter_1082L',
 'actualdpdtolerance_344P',
 'annuity_780A',
 'annuitynextmonth_57A',
 'applicationcnt_361L',
 'applications30d_658L',
 'applicationscnt_1086L',
 'applicationscnt_464L',
 'applicationscnt_629L',
 'avgdbddpdlast3m_4187120P',
 'avgdpdtolclosure24_3658938P',
 'avglnamtstart24m_4525187A',
 'avgmaxdpdlast9m_3716943P',
 'avgpmtla

In [8]:
iv = SelectByInformationValue(strategy='equal_frequency', bins=5)
iv.fit(X,y )
iv.get_feature_names_out()

ValueError: The proportion of one of the classes for a category in variable max_num_group2_13 is zero, and log of zero is not defined

In [11]:
X['assignmentdate_238D'].describe()

count    1.526537e+06
mean    -1.896536e+03
std      7.095516e+02
min     -1.644800e+04
25%     -1.850000e+03
50%     -1.850000e+03
75%     -1.850000e+03
max      1.400000e+01
Name: assignmentdate_238D, dtype: float64

In [7]:
sel = ProbeFeatureSelection(
    estimator=LogisticRegression(),
    scoring="roc_auc",
    n_probes=3,
    distribution="normal",
    cv=3,
    random_state=150,
)
sel.fit(X, y)
sel.get_feature_names_out()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

['month_decision',
 'weekday_decision',
 'assignmentdate_238D',
 'assignmentdate_4527235D',
 'birthdate_574D',
 'contractssum_5085716L',
 'dateofbirth_337D',
 'days120_123L',
 'days180_256L',
 'days30_165L',
 'days360_512L',
 'days90_310L',
 'education_1103M',
 'firstquarter_103L',
 'fourthquarter_440L',
 'numberofqueries_373L',
 'pmtaverage_3A',
 'pmtaverage_4527227A',
 'pmtcount_4527229L',
 'pmtcount_693L',
 'pmtscount_423L',
 'pmtssum_45A',
 'requesttype_4525192L',
 'responsedate_1012D',
 'responsedate_4527233D',
 'responsedate_4917613D',
 'secondquarter_766L',
 'thirdquarter_1082L',
 'actualdpdtolerance_344P',
 'amtinstpaidbefduel24m_4187115A',
 'annuity_780A',
 'annuitynextmonth_57A',
 'applications30d_658L',
 'applicationscnt_1086L',
 'applicationscnt_464L',
 'applicationscnt_629L',
 'applicationscnt_867L',
 'avgdbddpdlast24m_3658932P',
 'avgdbddpdlast3m_4187120P',
 'avgdbdtollast24m_4525197P',
 'avgdpdtolclosure24_3658938P',
 'avginstallast24m_3658937A',
 'avglnamtstart24m_45251

In [8]:
sfp = SelectBySingleFeaturePerformance(LogisticRegression(), cv=3)
sfp.fit(X, y)
sfp.get_feature_names_out()

['weekday_decision',
 'birthdate_574D',
 'contractssum_5085716L',
 'dateofbirth_337D',
 'days120_123L',
 'days180_256L',
 'days30_165L',
 'days360_512L',
 'days90_310L',
 'description_5085714M',
 'education_1103M',
 'firstquarter_103L',
 'fourthquarter_440L',
 'maritalst_385M',
 'numberofqueries_373L',
 'pmtscount_423L',
 'pmtssum_45A',
 'requesttype_4525192L',
 'secondquarter_766L',
 'thirdquarter_1082L',
 'amtinstpaidbefduel24m_4187115A',
 'annuity_780A',
 'applications30d_658L',
 'avgdbddpdlast24m_3658932P',
 'avgdbddpdlast3m_4187120P',
 'avgdbdtollast24m_4525197P',
 'avgdpdtolclosure24_3658938P',
 'avglnamtstart24m_4525187A',
 'avgmaxdpdlast9m_3716943P',
 'avgoutstandbalancel6m_4187114A',
 'cntincpaycont9m_3716944L',
 'cntpmts24_3658933L',
 'credamount_770A',
 'currdebt_22A',
 'datefirstoffer_1144D',
 'datelastinstal40dpd_247D',
 'daysoverduetolerancedd_3976961L',
 'disbursedcredamount_1113A',
 'eir_270L',
 'firstclxcampaign_1125D',
 'firstdatedue_489D',
 'homephncnt_628L',
 'inter

In [None]:
tmp = SelectByTargetMeanPerformance(bins = 3, cv=3,strategy='equal_frequency')
tmp.fit(X, y)
tmp.get_feature_names_out()

In [7]:
def reduce_group(grps):
    """picks the features with the largest number of unique values from each group"""
    use = []
    for g in grps:
        mx = 0; vx = g[0]
        for gg in g:
            n = X[gg].nunique()
            if n>mx:
                mx = n
                vx = gg
        use.append(vx)
        #print()
    print('Use these',use)
    return use

def group_columns_by_correlation(matrix, threshold=0.8):
    """returns groups of correlated features"""
    correlation_matrix = matrix.corr()
    groups = []
    remaining_cols = list(matrix.columns)
    while remaining_cols:
        col = remaining_cols.pop(0)
        group = [col]
        correlated_cols = [col]
        for c in remaining_cols:
            if correlation_matrix.loc[col, c] >= threshold:
                group.append(c)
                correlated_cols.append(c)
        groups.append(group)
        remaining_cols = [c for c in remaining_cols if c not in correlated_cols]
    
    return groups

def drop_corr_keep_best(X):
    nans_df = X[num_cols_base].isna()
    nans_groups={}
    for col in num_cols_base:
        cur_group = nans_df[col].sum()
        try:
            nans_groups[cur_group].append(col)
        except:
            nans_groups[cur_group]=[col]
    
    uses=[]
    for k,v in nans_groups.items():
        if len(v)>1:
                Vs = nans_groups[k]
                #cross_features=list(combinations(Vs, 2))
                #make_corr(Vs)
                grps= group_columns_by_correlation(X[Vs], threshold=0.8)
                use=reduce_group(grps)
                uses=uses+use
                #make_corr(use)
        else:
            uses=uses+v
        print('####### NAN count =',k)
    print(uses)
    print(len(uses))
    uses=uses+cat_cols_base
    print(len(uses))
    return X[uses]

In [8]:
cat_cols_base = list(X.select_dtypes("category").columns)
num_cols_base = list(X.select_dtypes(exclude="category").columns)

In [10]:
print(X.shape)
corr_matrix = np.corrcoef(X.values, rowvar=False)
corr_matrix.shape

(1526537, 969)


  c /= stddev[:, None]
  c /= stddev[None, :]


(969, 969)

In [15]:
np.where(corr_matrix > 0.9)

(array([  0,   1,   2, ..., 966, 967, 968], dtype=int64),
 array([  0,   1,   2, ..., 966, 967, 968], dtype=int64))

In [19]:
cols_to_drop = set()
print('corr computed')
for i, col1 in enumerate(X.columns):
    for j, col2 in enumerate(X.columns):
        if i != j:
            # Check for high correlation
            if abs(corr_matrix[i, j]) >= 0.90:
                # Check which column has more missing values
                cols_to_drop.add(col1)
len(cols_to_drop)

corr computed


355

In [9]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 11,
    "num_leaves": 144,
    "learning_rate": 0.05,
    "feature_fraction": 0.54,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 1000,
    "verbose": -1,
    "min_child_weight": 1.8,
    "reg_alpha": 5,
    "reg_lambda": 100,
    "colsample_bytree": 0.633,
    "feature_pre_filter": False,
    "min_child_samples": 237,
    "subsample": 0.52,
}

In [10]:
train_data = lgb.Dataset(X, label=y)
eval_result = {}
res = lgb.cv(params, train_data, callbacks=[lgb.log_evaluation(200), lgb.early_stopping(10), lgb.record_evaluation(eval_result)], return_cvbooster=True)



Training until validation scores don't improve for 10 rounds
[200]	cv_agg's valid auc: 0.857257 + 0.00111501
[400]	cv_agg's valid auc: 0.861676 + 0.000912863
[600]	cv_agg's valid auc: 0.86317 + 0.00062404
[800]	cv_agg's valid auc: 0.863745 + 0.000616252
Early stopping, best iteration is:
[790]	cv_agg's valid auc: 0.863752 + 0.000624731


In [None]:
X = drop_corr_keep_best(X)
print(X.shape)

train_data = lgb.Dataset(X, label=y)
eval_result = {}
res = lgb.cv(params, train_data, callbacks=[lgb.log_evaluation(200), lgb.early_stopping(10), lgb.record_evaluation(eval_result)], return_cvbooster=True)

Use these ['month_decision', 'weekday_decision', 'credamount_770A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_867L', 'clientscnt_1022L', 'clientscnt_100L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'deferredmnthsnum_166L', 'disbursedcredamount_1113A', 'downpmt_116A', 'homephncnt_628L', 'isbidproduct_1095L', 'mobilephncnt_593L', 'numactivecreds_622L', 'numactivecredschannel_414L', 'numactiverelcontr_750L', 'numcontrs3months_479L', 'numnotactivated_1143L', 'numpmtchanneldd_318L', 'numrejects9m_859L', 'sellerplacecnt_915L', 'max_mainoccupationinc_384A', 'max_birth_259D', 'max_num_group1_9', 'pipeline-1__delta_date_decision_birth_259D_mean', 'pipeline-2__case_id', 'pipeline-2__MONTH', 'pipeline-2__WEEK_NUM', 'pipeline-2__applicationscnt_1086L', 'pipeline-2__applicationscnt



Training until validation scores don't improve for 10 rounds
[200]	cv_agg's valid auc: 0.856703 + 0.00107762


In [None]:
X.columns

In [23]:
cols_to_drop

{'actualdpdtolerance_344P',
 'applicationscnt_464L',
 'applicationscnt_629L',
 'avgdbddpdlast24m_3658932P',
 'avgdbddpdlast3m_4187120P',
 'avgdbdtollast24m_4525197P',
 'avgdpdtolclosure24_3658938P',
 'avgoutstandbalancel6m_4187114A',
 'clientscnt12m_3712952L',
 'clientscnt3m_3712950L',
 'clientscnt6m_3712949L',
 'clientscnt_1022L',
 'contractssum_5085716L',
 'credamount_770A',
 'currdebt_22A',
 'dateofbirth_337D',
 'days120_123L',
 'days180_256L',
 'days360_512L',
 'days90_310L',
 'description_5085714M',
 'disbursedcredamount_1113A',
 'disbursementtype_67L',
 'education_88M',
 'eir_270L',
 'inittransactioncode_186L',
 'interestrate_311L',
 'isbidproduct_1095L',
 'last_actualdpd_943P',
 'last_amount_416A',
 'last_amount_4527230A',
 'last_amount_4917619A',
 'last_annuity_853A',
 'last_approvaldate_319D',
 'last_cacccardblochreas_147M',
 'last_cancelreason_3545846M',
 'last_classificationofcontr_13M',
 'last_classificationofcontr_400M',
 'last_collater_typofvalofguarant_298M',
 'last_coll