In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm

In [2]:
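# 1) Is the `credit` column a rating of how delinquent the person's payments were, measured from the issuance of the credit card corresponding to that row up to the data collection date?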
!unzip /content/drive/MyDrive/old_dacon/dacon_신용/open.zip

Archive:  /content/drive/MyDrive/old_dacon/dacon_신용/open.zip
   creating: open/
  inflating: open/train.csv          
  inflating: open/sample_submission.csv  
  inflating: open/test.csv           


In [6]:
# Feature-engineering cell (older variant).
# NOTE(review): the loading / de-duplication code below is commented out, yet
# `df_train` and `df_test` are read a few lines further down. This cell therefore
# only runs when those frames already exist in the kernel; an active copy of the
# same loading code appears in a later cell of this notebook.
# df_train = pd.read_csv('open/train.csv')
# df_test = pd.read_csv('open/test.csv')
# sub = pd.read_csv('open/sample_submission.csv')

# # df_train = df_train.sort_values('begin_month', ascending=True).reset_index(drop=True)
# # df_test = df_test.sort_values('begin_month', ascending=True).reset_index(drop=True)

# df_train.loc[df_train['child_num'] > df_train['family_size'], 'family_size'] = 4
# df_train.loc[(df_train['family_type']=='Married') & (df_train['family_size']<2), 'family_size'] = 3
# df_train.loc[df_train['child_num'] == df_train['family_size'], 'family_size'] = 2

# df_test.loc[df_test['child_num'] > df_test['family_size'], 'family_size'] = 4
# df_test.loc[(df_test['family_type']=='Married') & (df_test['family_size']<2), 'family_size'] = 3

# df_train['uid_rows'] = ['_'.join(df_train.loc[idx, 'gender':'family_size'].astype(str).tolist()) for idx in tqdm(range(len(df_train)))]
# df_test['uid_rows'] = ['_'.join(df_test.loc[idx, 'gender':'family_size'].astype(str).tolist()) for idx in tqdm(range(len(df_test)))]

# df_train = pd.concat([df_train, df_train.groupby('uid_rows')[['index']].rank().rename(columns={'index':'rank'})], 1)
# df_test['rank'] = 0

# Work on copies so the loaded frames stay untouched (they are reused for clustering).
train = df_train.copy()
test = df_test.copy()

# Dependents: household size minus number of children.
train['n_dependents2'] = train['family_size'] - train['child_num']
test['n_dependents2'] = test['family_size'] - test['child_num']

TARGET = 'credit'

# Frequency encoding: replace each category by its relative frequency in train.
# The mapping is computed from train before train[col] is overwritten and is then
# applied to both frames; values that never occur in train become NaN.
for col in ['gender', 'car', 'reality', 'edu_type', 'house_type', 'occyp_type', 'income_type', 'family_type', 'work_phone', 'phone', 'email']:
    temp = train[col].value_counts(True).to_dict()
    for df in [train, test]:
        df[col] = df[col].map(temp)

# Turn |DAYS_BIRTH| (a day offset) into a calendar date, with offset 0 mapped to
# 2019-12-31. Offsets outside the 25153-entry lookup table map to missing.
train['DAYS_BIRTH_DATE'] = train['DAYS_BIRTH'].abs().map({k:v for k,v in enumerate(pd.date_range(end='2019-12-31', periods=25153, freq='d')[::-1])})
test['DAYS_BIRTH_DATE'] = test['DAYS_BIRTH'].abs().map({k:v for k,v in enumerate(pd.date_range(end='2019-12-31', periods=25153, freq='d')[::-1])})

# Original note: unemployed status -> NULL.
# Offsets not covered by the 15714-entry table come out missing and are then
# replaced by the sentinel date 2020-01-01.
train['DAYS_EMPLOYED_DATE'] = train['DAYS_EMPLOYED'].abs().map({k:v for k,v in enumerate(pd.date_range(end='2019-12-31', periods=15714, freq='d')[::-1])})
test['DAYS_EMPLOYED_DATE'] = test['DAYS_EMPLOYED'].abs().map({k:v for k,v in enumerate(pd.date_range(end='2019-12-31', periods=15714, freq='d')[::-1])})
train['DAYS_EMPLOYED_DATE'] = pd.to_datetime(train['DAYS_EMPLOYED_DATE'].fillna('2020-01-01'))
test['DAYS_EMPLOYED_DATE'] = pd.to_datetime(test['DAYS_EMPLOYED_DATE'].fillna('2020-01-01'))


# |begin_month| (a month offset) -> month-end date, offset 0 mapped to 2019-12-31.
train['begin_month_DATE'] = train['begin_month'].abs().map({k:v for k,v in enumerate(pd.date_range(end='2019-12-31', periods=61, freq='m')[::-1])})
test['begin_month_DATE'] = test['begin_month'].abs().map({k:v for k,v in enumerate(pd.date_range(end='2019-12-31', periods=61, freq='m')[::-1])})

for df in [train, test]:

    # Age as of 2019, and years remaining until 60 (original note: years left until retirement).
    df['age'] = 2019 - df['DAYS_BIRTH_DATE'].dt.year
    df['age2'] = 60 - df['age']
    # NOTE(review): multiplies income by (year + month) of the employment date;
    # the resulting column is listed in `drop_col` further down this cell.
    df['income_total_y'] = df['income_total'] * (df['DAYS_EMPLOYED_DATE'].dt.year + df['DAYS_EMPLOYED_DATE'].dt.month)

    # Calendar components of the derived birth date.
    df['DAYS_BIRTH_month'] = df['DAYS_BIRTH_DATE'].dt.month
    df['DAYS_BIRTH_week'] = df['DAYS_BIRTH_DATE'].dt.isocalendar().week.astype(int)
    df['DAYS_BIRTH_day'] = df['DAYS_BIRTH_DATE'].dt.day
    df['DAYS_BIRTH_weekday'] = df['DAYS_BIRTH_DATE'].dt.weekday

    # Calendar components of the derived employment start date.
    df['DAYS_EMPLOYED_DATE_month'] = df['DAYS_EMPLOYED_DATE'].dt.month
    df['DAYS_EMPLOYED_DATE_week'] = df['DAYS_EMPLOYED_DATE'].dt.isocalendar().week.astype(int)
    df['DAYS_EMPLOYED_DATE_day'] = df['DAYS_EMPLOYED_DATE'].dt.day
    df['DAYS_EMPLOYED_DATE_weekday'] = df['DAYS_EMPLOYED_DATE'].dt.weekday

    # Day difference between the birth date and the employment date.
    # NOTE(review): the day count is floor-divided by 12 - confirm the intended unit.
    df['employment_days'] = (df['DAYS_BIRTH_DATE'] - df['DAYS_EMPLOYED_DATE']).dt.days
    df['employment_days'] = df['employment_days']//12

    # Rows carrying the sentinel employment date get -1 in every employment-derived feature.
    df.loc[df['DAYS_EMPLOYED_DATE']=='2020-01-01', ['DAYS_EMPLOYED_DATE_month', 'DAYS_EMPLOYED_DATE_week', 'DAYS_EMPLOYED_DATE_day', 'DAYS_EMPLOYED_DATE_weekday', 'employment_days']] = -1

    # Year / month of the derived begin_month date.
    df['begin_month_year'] = df['begin_month_DATE'].dt.year
    df['begin_month_month'] = df['begin_month_DATE'].dt.month
    
def create_features(train, test, uid, feature, aggs):
    """Add group-wise aggregates of ``feature`` to copies of ``train`` and ``test``.

    Rows are grouped by the ``uid`` columns, each cast with ``astype(str)`` and
    joined with ``'_'`` (so missing keys form their own ``'nan'`` group).
    Statistics are computed on ``train`` only and mapped onto both frames;
    ``test`` rows whose key never occurs in ``train`` receive NaN.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Input frames. They are copied, never modified in place.
    uid : sequence of str
        One or more key column names. Any number of keys is supported; the
        previous implementation silently used only the first key unless exactly
        two or three were given.
    feature : str
        Name of the column to aggregate.
    aggs : iterable of str
        Aggregation names accepted by ``GroupBy.agg`` (e.g. ``'mean'``, ``'std'``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Copies of ``train`` and ``test`` with one extra column per aggregation,
        named ``'<uid1>_<uid2>..._<feature>_<agg>'``. With a single key the name
        is ``'<uid1>__<feature>_<agg>'`` (double underscore), as before.

    Raises
    ------
    ValueError
        If ``uid`` is empty.
    """
    keys = list(uid)
    if not keys:
        raise ValueError('uid must contain at least one column name')

    tr, te = train.copy(), test.copy()

    def _group_key(frame):
        # String key per row; kept as a standalone Series so no temporary
        # 'uid' column is written into (and later dropped from) the frame.
        key = frame[keys[0]].astype(str)
        for name in keys[1:]:
            key = key + '_' + frame[name].astype(str)
        return key

    tr_key, te_key = _group_key(tr), _group_key(te)

    # Historical naming: a single key yields '<uid1>__...' with two underscores.
    prefix = '_'.join(keys) if len(keys) > 1 else keys[0] + '_'

    grouped = tr[feature].groupby(tr_key)
    for agg in aggs:
        new_col = f'{prefix}_{feature}_{agg}'
        stats = grouped.agg(agg)  # computed once, reused for both frames
        tr[new_col] = tr_key.map(stats)
        te[new_col] = te_key.map(stats)

    return tr, te

# Per-(DAYS_BIRTH, DAYS_EMPLOYED) statistics of the begin_month features,
# computed on train and mapped onto both frames.
train, test = create_features(train, test, ['DAYS_BIRTH', 'DAYS_EMPLOYED'], 'begin_month', ['mean', 'std', 'max'])
train, test = create_features(train, test, ['DAYS_BIRTH', 'DAYS_EMPLOYED'], 'begin_month_year', ['mean', 'std'])
train, test = create_features(train, test, ['DAYS_BIRTH', 'DAYS_EMPLOYED'], 'begin_month_month', ['mean', 'std'])

# Sentinel for every remaining missing value.
train = train.fillna(-99)
test = test.fillna(-99)

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.cluster import KMeans, AgglomerativeClustering

# Pseudo person identifier built from four columns, then frequency-encoded with
# the relative frequencies observed in train (unseen test values become NaN).
for data in [train, test]:
    data['personal_id'] = data['gender'].astype(str) + "_" + data['DAYS_BIRTH'].astype(str) + "_" + data['income_total'].astype(str) + "_" + data['income_type'].astype(str)

for col in ['personal_id']:
    temp = train[col].value_counts(True).to_dict()
    for df in [train, test]:
        df[col] = df[col].map(temp)

# --- KMeans cluster id as an extra feature ---------------------------------
# Clustering runs on the frames as loaded (df_train / df_test), one-hot encoded
# and min-max scaled, not on the frequency-encoded `train` / `test`.
CLUSTER_CATEGORICAL = ['gender', 'car', 'reality', 'edu_type', 'house_type', 'occyp_type',
                       'income_type', 'family_type', 'work_phone', 'phone', 'email']
CLUSTER_EXCLUDED = ['index', 'rank', 'uid_rows']

# `columns=` keeps the remaining columns first and appends the dummies in list
# order, i.e. the same layout as concatenating and dropping one column at a time,
# without the positional `axis` argument that pandas >= 2.0 no longer accepts.
cluster_train = pd.get_dummies(df_train.drop(columns=['credit'] + CLUSTER_EXCLUDED), columns=CLUSTER_CATEGORICAL)
cluster_test = pd.get_dummies(df_test.drop(columns=CLUSTER_EXCLUDED), columns=CLUSTER_CATEGORICAL)

# The dummies are built independently per frame, so a category missing from one
# of them would shift or drop columns. Force the test matrix onto the train
# layout before converting to raw arrays (absent categories become all-zero).
cluster_test = cluster_test.reindex(columns=cluster_train.columns, fill_value=0)

scaler = MinMaxScaler()
scaler.fit(cluster_train.values)

cluster_train = scaler.transform(cluster_train.values)
cluster_test = scaler.transform(cluster_test.values)

km = KMeans(n_clusters=100, random_state=42, max_iter=1000, )

km.fit(cluster_train)

km_train = km.predict(cluster_train)
km_test = km.predict(cluster_test)

train['cluster'] = km_train
test['cluster'] = km_test

# Columns excluded from the model feature list built by the training cells.
drop_col = ['index', 'FLAG_MOBIL', 'DAYS_BIRTH_DATE', 'DAYS_EMPLOYED_DATE', 'begin_month_DATE', 'DAYS_EMPLOYED', 'DAYS_BIRTH',
            'begin_month_year', 'begin_month_month', 'uid_rows', 'rank', 'income_total_y',]

## rf, xgb

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest, grouped 8-fold CV: out-of-fold predictions in `oof_rf`,
# fold-averaged test predictions in `pred_rf`.
FOLDS = 8
RANDOM_STATE = 0

gk = GroupKFold(n_splits=FOLDS)
oof_rf = np.zeros([len(train), 3])
pred_rf = np.zeros([len(test), 3])

features = [col for col in test.columns if col not in drop_col]

# The scaler is fitted once on all training rows (validation rows included).
scaler = MinMaxScaler()
scaler.fit(train[features])

# Scaled matrices and labels are identical for every fold, so build them once
# instead of re-transforming both frames inside the loop.
X_train = scaler.transform(train[features])
X_test = scaler.transform(test[features])
y_train = train[TARGET].values

# Folds are grouped by `rank`, so rows sharing a rank value never straddle the
# train/validation boundary.
for idx, (trn_idx, val_idx) in enumerate(gk.split(train, groups=train['rank'])):
    clf = RandomForestClassifier(n_estimators=1000, max_depth=8,
                                 random_state=RANDOM_STATE)
    clf.fit(X_train[trn_idx], y_train[trn_idx])

    oof_rf[val_idx] = clf.predict_proba(X_train[val_idx])
    pred_rf += clf.predict_proba(X_test) / FOLDS
    print(idx, 'fold complete ---\n')

log_loss(y_train, oof_rf)

0 fold complete ---

1 fold complete ---

2 fold complete ---

3 fold complete ---

4 fold complete ---

5 fold complete ---

6 fold complete ---

7 fold complete ---



0.7810939217101271

In [None]:
# XGBoost, grouped CV: out-of-fold predictions in `oof_xgb`, fold-averaged test
# predictions in `pred_xgb`.
folds = 8
# Build the splitter here rather than reusing whatever `gk` another cell left in
# the kernel: test predictions are averaged with `/ folds`, so the splitter must
# yield exactly `folds` splits or the probabilities no longer sum to 1.
gk = GroupKFold(n_splits=folds)
oof_xgb = np.zeros([len(train), 3])
pred_xgb = np.zeros([len(test), 3])
features = [col for col in test.columns if col not in drop_col]

# Fold-invariant inputs, built once.
X_train = train[features]
X_test = test[features]
y_train = train[TARGET].values

for idx, (trn_idx, val_idx) in enumerate(gk.split(train, groups=train['rank'])):
    # `num_leaves` and `metrics` were dropped from the constructor: they are
    # LightGBM names, not XGBoost parameters. The evaluation metric is the
    # `eval_metric` passed to fit() below.
    clf = XGBClassifier(
                         objective='multi:softprob',
                         num_class=3,
                         n_estimators=10000,
                         max_depth=8,
                         learning_rate=0.03,
                         colsample_bytree=0.5,
                         subsample=0.7,
                         reg_alpha=0.01,
                         reg_lambda=0.01,
                         random_state=0,
                        )

    # The splitter returns positional indices, hence .iloc / plain array indexing.
    X_trn, y_trn = X_train.iloc[trn_idx], y_train[trn_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train[val_idx]

    # The last eval_set entry (the validation fold) drives early stopping.
    # NOTE(review): passing eval_metric / early_stopping_rounds to fit() is the
    # legacy call style this notebook was run with; newer XGBoost releases expect
    # them on the constructor - confirm against the installed version.
    evals = [(X_trn, y_trn), (X_val, y_val)]
    clf.fit(X_trn, y_trn, eval_set=evals, eval_metric='mlogloss', early_stopping_rounds=50, verbose=1000)

    oof_xgb[val_idx] = clf.predict_proba(X_val)
    pred_xgb += clf.predict_proba(X_test) / folds

[0]	validation_0-mlogloss:1.08342	validation_1-mlogloss:1.08386
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[431]	validation_0-mlogloss:0.410773	validation_1-mlogloss:0.685429

[0]	validation_0-mlogloss:1.08412	validation_1-mlogloss:1.08501
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[496]	validation_0-mlogloss:0.388813	validation_1-mlogloss:0.654976

[0]	validation_0-mlogloss:1.0834	validation_1-mlogloss:1.0839
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[474]	validation_0-mlogloss:0.407653	validation_1-mlogloss:0.624729

[0]	validation_0-mloglos

In [None]:
# Out-of-fold log loss of an equal-weight blend of `oof` and `oof_xgb`.
# NOTE(review): `oof` is only assigned in cells further down this notebook, so
# this cell depends on out-of-order execution (hidden kernel state).
log_loss(y_train, oof*0.5 + oof_xgb*0.5)

0.6669427983066565

In [None]:
# Out-of-fold log loss of a 0.5 / 0.4 / 0.1 blend of `oof`, `oof_xgb` and `oof_nn`.
# NOTE(review): `oof` and `oof_nn` are only assigned in cells further down this
# notebook, so this cell depends on out-of-order execution.
log_loss(y_train, oof*0.5 + oof_xgb*0.4 + oof_nn*.1)

0.6699075716823077

In [None]:
# Write the 0.5 / 0.4 / 0.1 blend of the test predictions into every submission
# column from '0' onward and save it as CSV; the frame is displayed as cell output.
# NOTE(review): `pred` and `pred_nn` are only assigned in cells further down this
# notebook - the blend uses whatever those names hold in the kernel at run time.
sub.loc[:, '0':] = pred*0.5 + pred_xgb*0.4 + pred_nn*0.1
sub.to_csv('submission_x_l_n.csv', index=False)
sub

Unnamed: 0,index,0,1,2
0,26457,0.065058,0.161051,0.773892
1,26458,0.300673,0.274650,0.424677
2,26459,0.036857,0.100426,0.862717
3,26460,0.060528,0.080429,0.859043
4,26461,0.081846,0.266709,0.651445
...,...,...,...,...
9995,36452,0.079717,0.263553,0.656729
9996,36453,0.260034,0.283022,0.456943
9997,36454,0.044962,0.135788,0.819250
9998,36455,0.152063,0.299943,0.547993


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler

# MLP, grouped 5-fold CV: out-of-fold predictions in `oof_nn`, fold-averaged
# test predictions in `pred_nn`.
FOLDS = 5
RANDOM_STATE = 0

gk = GroupKFold(n_splits=FOLDS)
oof_nn = np.zeros([len(train), 3])
pred_nn = np.zeros([len(test), 3])

features = [col for col in test.columns if col not in drop_col]

# The scaler is fitted once on all training rows (validation rows included).
scaler = MinMaxScaler()
scaler.fit(train[features])

# Scaled matrices and labels are identical for every fold, so build them once
# instead of re-transforming both frames inside the loop.
X_train = scaler.transform(train[features])
X_test = scaler.transform(test[features])
y_train = train[TARGET].values

for idx, (trn_idx, val_idx) in enumerate(gk.split(train, groups=train['rank'])):
    # NOTE(review): hidden_layer_sizes=(2048, 2) gives the second hidden layer
    # only 2 units - confirm this bottleneck is intended. `warm_start` is set on
    # an estimator that is created anew and fitted once per fold.
    clf = MLPClassifier(hidden_layer_sizes=(2048, 2), max_iter=2000, warm_start=True, momentum=0.01, early_stopping=True,
                         random_state=RANDOM_STATE,
                        )
    clf.fit(X_train[trn_idx], y_train[trn_idx])

    oof_nn[val_idx] = clf.predict_proba(X_train[val_idx])
    pred_nn += clf.predict_proba(X_test) / FOLDS
    print(idx, 'fold complete ---\n')

log_loss(y_train, oof_nn)

0 fold complete ---

1 fold complete ---

2 fold complete ---

3 fold complete ---

4 fold complete ---



0.9168684420194056

In [None]:
# Snapshot the current `oof` / `pred` arrays under the *_nn names.
# NOTE(review): this replaces the `oof_nn` / `pred_nn` arrays filled by the MLP
# cell with whatever `oof` / `pred` currently hold in the kernel - confirm that
# is intended before blending.
oof_nn = oof.copy()
pred_nn = pred.copy()

log_loss(y_train, oof)

0.9048024576665183

## lgb

In [8]:
# LightGBM, stratified 5-fold CV: out-of-fold predictions in `oof`, fold-averaged
# test predictions in `pred`, fitted models in `lgb_models`.
FOLDS = 5
RANDOM_STATE = 0

# Three splitters are prepared; only `skf` drives the loop below. `gk` (grouped
# by `rank`) and `kf` are the alternatives tried during experimentation.
skf = StratifiedKFold(n_splits=FOLDS, random_state=RANDOM_STATE, shuffle=True)
gk = GroupKFold(n_splits=FOLDS)
kf = KFold(n_splits=FOLDS, random_state=RANDOM_STATE, shuffle=True)

oof = np.zeros([len(train), 3])
pred = np.zeros([len(test), 3])
lgb_models = {}

# Upper bound on boosting rounds for each LightGBM boosting type.
params = {'dart':1000, 'gbdt':5000, 'rf':5000, 'goss':5000}
learn_type = 'gbdt'

features = [col for col in test.columns if col not in drop_col]

# Fold-invariant inputs, built once.
X_train = train[features]
X_test = test[features]
y_train = train[TARGET].values

# Accumulator for the fold-averaged feature importances.
feature_importances = np.zeros(len(features))

for idx, (trn_idx, val_idx) in enumerate(skf.split(train, y=train['credit'])):
    # The misspelled `num_calsss=3` keyword was removed: it is not a LightGBM
    # parameter, and LGBMClassifier derives the number of classes from `y`.
    # NOTE(review): `subsample` is set while `subsample_freq` is left at its
    # default, so row subsampling may not be active - confirm.
    clf = LGBMClassifier(
                         boosting_type=learn_type,
                         objective='multiclass', #['binary', 'multiclass', 'multiclassova']
                         metrics='multi_logloss', #['log_loss', 'multi_logloss', 'multi_error']
                         n_estimators=params[learn_type],
                         max_depth=32,
                         learning_rate=0.03,
                         colsample_bytree=0.5,
                         subsample=0.7,
                         num_leaves=256,
                         reg_alpha=0.01,
                         reg_lambda=0.01,
                         random_state=RANDOM_STATE,
                        )

    # The splitter returns positional indices, hence .iloc / plain array indexing.
    X_trn, y_trn = X_train.iloc[trn_idx], y_train[trn_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train[val_idx]

    # Both the training and the validation part are tracked during fitting.
    evals = [(X_trn, y_trn), (X_val, y_val)]
    clf.fit(X_trn, y_trn, eval_set=evals, early_stopping_rounds=100, verbose=500)

    feature_importances += clf.feature_importances_ / FOLDS

    oof[val_idx] = clf.predict_proba(X_val)
    pred += clf.predict_proba(X_test) / FOLDS

    lgb_models[idx] = clf
    print(idx+1, 'fold complete ################################\n')

log_loss(y_train, oof)

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[164]	valid_0's multi_logloss: 0.409173	valid_1's multi_logloss: 0.697694
1 fold complete ################################

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[173]	valid_0's multi_logloss: 0.403528	valid_1's multi_logloss: 0.677376
2 fold complete ################################

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[194]	valid_0's multi_logloss: 0.384438	valid_1's multi_logloss: 0.670525
3 fold complete ################################

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[172]	valid_0's multi_logloss: 0.402971	valid_1's multi_logloss: 0.690319
4 fold complete ################################

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[176]	valid_0'

0.6854585631568699

In [None]:
# Experiment log (out-of-fold log loss recorded for earlier configurations):
# 'category' - reality, family_type
# 5fold best : 0.6714943537141546
# 10fold best : 0.6687885953231191
# add 100kmeans / add 100kmeans gb bm / add 100kmeans gb bmy / add 100kmeans gb bmm : 0.6678719604994495 / 0.6672104903128505 / 0.66793124490285 / 0.6684939580526221
# Score of whatever `oof` currently holds in the kernel.
log_loss(y_train, oof)

0.6687236534205659

In [None]:
# seed 42
# Snapshot the current `oof` / `pred` as run #1 and report its out-of-fold log loss.
# NOTE(review): presumably taken right after a training run with seed 42 - the
# seed is not set in this cell, so the label relies on execution order.
oof1 = oof.copy()
pred1 = pred.copy()
log_loss(y_train, oof1)

0.6695582677962895

In [None]:
# seed 0
# Snapshot the current `oof` / `pred` as run #2 and report its out-of-fold log loss.
# NOTE(review): presumably taken right after a training run with seed 0 - the
# seed is not set in this cell, so the label relies on execution order.
oof2 = oof.copy()
pred2 = pred.copy()
log_loss(y_train, oof2)

0.6687236534205659

In [None]:
# Fold-averaged feature importances paired with their feature names, largest first.
pd.DataFrame({'value':feature_importances, 
              'feature':test[features].columns}).sort_values('value', ascending=False)

Unnamed: 0,value,feature
9,8992.6,DAYS_BIRTH
34,8493.4,DAYS_BIRTH_DAYS_EMPLOYED_begin_month_month_std
15,8081.0,begin_month
28,7959.4,DAYS_BIRTH_DAYS_EMPLOYED_begin_month_mean
29,7790.0,DAYS_BIRTH_DAYS_EMPLOYED_begin_month_std
33,7349.2,DAYS_BIRTH_DAYS_EMPLOYED_begin_month_month_mean
35,7278.4,cluster
27,7048.2,employment_days
4,6775.4,income_total
20,6432.4,DAYS_BIRTH_week


In [None]:
# Row sums of `pred` after rescaling by 8/5.
# NOTE(review): presumably a sanity check for predictions accumulated over 5
# folds but divided by 8 - the factors are hard-coded, confirm against the run
# that produced `pred`.
(pred/5*8).sum(1)

array([1., 1., 1., ..., 1., 1., 1.])

In [None]:
# Log loss of a blend: 0.3 x the average of the two seed snapshots + 0.7 x `oof3`.
# NOTE(review): `oof3` is not assigned anywhere in the visible part of this
# notebook - it must already exist in the kernel for this cell to run.
log_loss(df_train[TARGET], (oof1*.5 + oof2*.5)*.3 + oof3*.7)

In [None]:
# Write `pred` scaled by 5 into every submission column from '0' onward and save it.
# NOTE(review): the x5 presumably undoes the `/ FOLDS` averaging for a run in
# which only a single fold contributed to `pred` - confirm before reusing.
sub.loc[:, '0':] = pred*5
sub.to_csv('submission_holdout.csv', index=False)
sub

Unnamed: 0,index,0,1,2
0,26457,0.076873,0.143967,0.779161
1,26458,0.170763,0.334556,0.494680
2,26459,0.022345,0.051508,0.926146
3,26460,0.071054,0.058465,0.870481
4,26461,0.051354,0.310933,0.637713
...,...,...,...,...
9995,36452,0.061794,0.104031,0.834175
9996,36453,0.229621,0.344777,0.425602
9997,36454,0.029437,0.079820,0.890743
9998,36455,0.235205,0.429368,0.335427


In [None]:
# Write the current `pred` array into every submission column from '0' onward,
# save it as CSV and display the frame.
sub.loc[:, '0':] = pred
sub.to_csv('submission_10fold_vc.csv', index=False)
sub

Unnamed: 0,index,0,1,2
0,26457,0.081060,0.171615,0.747325
1,26458,0.369734,0.285064,0.345202
2,26459,0.032977,0.087273,0.879750
3,26460,0.070734,0.069845,0.859421
4,26461,0.057851,0.240005,0.702144
...,...,...,...,...
9995,36452,0.104379,0.295846,0.599775
9996,36453,0.218298,0.235430,0.546272
9997,36454,0.043793,0.115590,0.840617
9998,36455,0.156328,0.250512,0.593160


In [None]:
# --- Load ---------------------------------------------------------------------
df_train = pd.read_csv('open/train.csv')
df_test = pd.read_csv('open/test.csv')
sub = pd.read_csv('open/sample_submission.csv')

# --- Patch household sizes that contradict the other family columns -----------
df_train.loc[df_train['child_num'] > df_train['family_size'], 'family_size'] = 4
df_train.loc[(df_train['family_type']=='Married') & (df_train['family_size']<2), 'family_size'] = 3
df_train.loc[df_train['child_num'] == df_train['family_size'], 'family_size'] = 2

# NOTE(review): the `child_num == family_size` rule above is applied to train
# only; test gets just the first two rules. Confirm the asymmetry is intended.
df_test.loc[df_test['child_num'] > df_test['family_size'], 'family_size'] = 4
df_test.loc[(df_test['family_type']=='Married') & (df_test['family_size']<2), 'family_size'] = 3


def _row_signature(frame):
    """Join the 'gender'..'family_size' columns of every row into one '_'-separated string."""
    as_text = frame.loc[:, 'gender':'family_size'].astype(str)
    signature = as_text.iloc[:, 0]
    for name in as_text.columns[1:]:
        signature = signature + '_' + as_text[name]
    return signature


# Column-wise string concatenation instead of one Python-level `.loc` lookup per row.
df_train['uid_rows'] = _row_signature(df_train)
df_test['uid_rows'] = _row_signature(df_test)

# Rank of each row (by `index`) among the rows sharing its signature; later used
# as the grouping key for GroupKFold. Test rows get a constant 0.
df_train['rank'] = df_train.groupby('uid_rows')['index'].rank()
df_test['rank'] = 0

# Work on copies so the loaded frames stay available for the clustering step.
train = df_train.copy()
test = df_test.copy()

# Dependents: household size minus number of children.
train['n_dependents2'] = train['family_size'] - train['child_num']
test['n_dependents2'] = test['family_size'] - test['child_num']

TARGET = 'credit'

# Frequency encoding: replace each category by its relative frequency in train.
# The mapping is taken from train before train[col] is overwritten and applied
# to both frames; values that never occur in train become NaN.
for col in ['gender', 'car', 'reality', 'edu_type', 'house_type', 'occyp_type', 'income_type', 'family_type', 'work_phone', 'phone', 'email']:
    temp = train[col].value_counts(True).to_dict()
    for df in [train, test]:
        df[col] = df[col].map(temp)

train['DAYS_EMPLOYED_binary'] = train['DAYS_EMPLOYED']>0
test['DAYS_EMPLOYED_binary'] = test['DAYS_EMPLOYED']>0

# --- Offsets -> calendar dates ----------------------------------------------
# Each lookup maps an offset k to the k-th period before 2019-12-31 (k = 0 is
# 2019-12-31 itself); offsets beyond the table length map to missing.
# NOTE(review): the table lengths are presumably sized to the largest offsets
# in the data - confirm.
REFERENCE_END = '2019-12-31'
BIRTH_LOOKUP_DAYS = 25153
EMPLOYED_LOOKUP_DAYS = 15714
BEGIN_LOOKUP_MONTHS = 61


def _offset_lookup(periods, freq):
    """Return {offset: timestamp} counting backwards from REFERENCE_END."""
    return dict(enumerate(pd.date_range(end=REFERENCE_END, periods=periods, freq=freq)[::-1]))


# Built once and shared by both frames instead of once per assignment.
birth_lookup = _offset_lookup(BIRTH_LOOKUP_DAYS, 'd')
employed_lookup = _offset_lookup(EMPLOYED_LOOKUP_DAYS, 'd')
begin_lookup = _offset_lookup(BEGIN_LOOKUP_MONTHS, 'm')

train['DAYS_BIRTH_DATE'] = train['DAYS_BIRTH'].abs().map(birth_lookup)
test['DAYS_BIRTH_DATE'] = test['DAYS_BIRTH'].abs().map(birth_lookup)

# Original note: unemployed status -> NULL. Offsets not covered by the table come
# out missing and are replaced by the sentinel date 2020-01-01.
train['DAYS_EMPLOYED_DATE'] = train['DAYS_EMPLOYED'].abs().map(employed_lookup)
test['DAYS_EMPLOYED_DATE'] = test['DAYS_EMPLOYED'].abs().map(employed_lookup)
train['DAYS_EMPLOYED_DATE'] = pd.to_datetime(train['DAYS_EMPLOYED_DATE'].fillna('2020-01-01'))
test['DAYS_EMPLOYED_DATE'] = pd.to_datetime(test['DAYS_EMPLOYED_DATE'].fillna('2020-01-01'))

train['begin_month_DATE'] = train['begin_month'].abs().map(begin_lookup)
test['begin_month_DATE'] = test['begin_month'].abs().map(begin_lookup)

# --- Date-derived features ----------------------------------------------------
for df in [train, test]:

    # Age as of 2019, and years remaining until 60 (original note: years left until retirement).
    df['age'] = 2019 - df['DAYS_BIRTH_DATE'].dt.year
    df['age2'] = 60 - df['age']
    # NOTE(review): income times (year + month) of the employment date; this
    # column is listed in `drop_col`, so it does not reach the models.
    df['income_total_y'] = df['income_total'] * (df['DAYS_EMPLOYED_DATE'].dt.year + df['DAYS_EMPLOYED_DATE'].dt.month)

    df['DAYS_BIRTH_month'] = df['DAYS_BIRTH_DATE'].dt.month
    df['DAYS_BIRTH_week'] = df['DAYS_BIRTH_DATE'].dt.isocalendar().week.astype(int)
    df['DAYS_BIRTH_day'] = df['DAYS_BIRTH_DATE'].dt.day
    df['DAYS_BIRTH_weekday'] = df['DAYS_BIRTH_DATE'].dt.weekday

    df['DAYS_EMPLOYED_DATE_month'] = df['DAYS_EMPLOYED_DATE'].dt.month
    df['DAYS_EMPLOYED_DATE_week'] = df['DAYS_EMPLOYED_DATE'].dt.isocalendar().week.astype(int)
    df['DAYS_EMPLOYED_DATE_day'] = df['DAYS_EMPLOYED_DATE'].dt.day
    df['DAYS_EMPLOYED_DATE_weekday'] = df['DAYS_EMPLOYED_DATE'].dt.weekday

    # Day difference between the birth date and the employment date.
    # NOTE(review): the day count is floor-divided by 12 - confirm the intended unit.
    df['employment_days'] = (df['DAYS_BIRTH_DATE'] - df['DAYS_EMPLOYED_DATE']).dt.days
    df['employment_days'] = df['employment_days']//12

    # Rows carrying the sentinel employment date get -1 in every employment-derived feature.
    df.loc[df['DAYS_EMPLOYED_DATE']=='2020-01-01', ['DAYS_EMPLOYED_DATE_month', 'DAYS_EMPLOYED_DATE_week', 'DAYS_EMPLOYED_DATE_day', 'DAYS_EMPLOYED_DATE_weekday', 'employment_days']] = -1

    df['begin_month_year'] = df['begin_month_DATE'].dt.year
    df['begin_month_month'] = df['begin_month_DATE'].dt.month
    
def create_features(train, test, uid, feature, aggs):
    """Add group-wise aggregates of ``feature`` to copies of ``train`` and ``test``.

    Rows are grouped by the ``uid`` columns, each cast with ``astype(str)`` and
    joined with ``'_'`` (so missing keys form their own ``'nan'`` group).
    Statistics are computed on ``train`` only and mapped onto both frames;
    ``test`` rows whose key never occurs in ``train`` receive NaN.

    After all aggregates are added, the column created for the LAST entry of
    ``aggs`` is replaced by the relative frequency of its values in ``train``
    (values unseen in ``train`` become NaN). Earlier aggregate columns keep
    their raw statistics.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Input frames. They are copied, never modified in place.
    uid : sequence of str
        One or more key column names. Any number of keys is supported; the
        previous implementation silently used only the first key unless exactly
        two or three were given.
    feature : str
        Name of the column to aggregate.
    aggs : iterable of str
        Aggregation names accepted by ``GroupBy.agg`` (e.g. ``'mean'``, ``'std'``).
        An empty iterable now returns plain copies instead of failing.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Copies of ``train`` and ``test`` with one extra column per aggregation,
        named ``'<uid1>_<uid2>..._<feature>_<agg>'``. With a single key the name
        is ``'<uid1>__<feature>_<agg>'`` (double underscore), as before.

    Raises
    ------
    ValueError
        If ``uid`` is empty.
    """
    keys = list(uid)
    if not keys:
        raise ValueError('uid must contain at least one column name')

    tr, te = train.copy(), test.copy()

    def _group_key(frame):
        # String key per row; kept as a standalone Series so no temporary
        # 'uid' column is written into (and later dropped from) the frame.
        key = frame[keys[0]].astype(str)
        for name in keys[1:]:
            key = key + '_' + frame[name].astype(str)
        return key

    tr_key, te_key = _group_key(tr), _group_key(te)

    # Historical naming: a single key yields '<uid1>__...' with two underscores.
    prefix = '_'.join(keys) if len(keys) > 1 else keys[0] + '_'

    grouped = tr[feature].groupby(tr_key)
    new_col = None
    for agg in aggs:
        new_col = f'{prefix}_{feature}_{agg}'
        stats = grouped.agg(agg)  # computed once, reused for both frames
        tr[new_col] = tr_key.map(stats)
        te[new_col] = te_key.map(stats)

    if new_col is not None:
        # NOTE(review): only the column of the last aggregation is frequency-
        # encoded, exactly as before. Confirm whether every aggregate column
        # was meant to be encoded.
        freq = tr[new_col].value_counts(True).to_dict()
        for frame in (tr, te):
            frame[new_col] = frame[new_col].map(freq)

    return tr, te

train, test = create_features(train, test, ['DAYS_BIRTH', 'DAYS_EMPLOYED'], 'begin_month', ['mean', 'std', 'max'])
train, test = create_features(train, test, ['DAYS_BIRTH', 'DAYS_EMPLOYED'], 'begin_month_year', ['mean', 'std'])
train, test = create_features(train, test, ['DAYS_BIRTH', 'DAYS_EMPLOYED'], 'begin_month_month', ['mean', 'std'])

# train, test = create_features(train, test, ['cluster'], 'begin_month', ['mean'])
# train, test = create_features(train, test, ['occyp_type'], 'income_total', ['mean'])
# train, test = create_features(train, test, ['cluster'], 'income_total', ['mean'])

train = train.fillna(-99)
test = test.fillna(-99)

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.cluster import KMeans, AgglomerativeClustering

# Cluster feature: K-means over the one-hot encoded, min-max scaled raw columns.
cluster_cat_cols = ['gender', 'car', 'reality', 'edu_type', 'house_type', 'occyp_type', 'income_type', 'family_type', 'work_phone', 'phone', 'email']
cluster_skip_cols = ['index', 'rank', 'uid_rows']

# Drop the label and bookkeeping columns, then one-hot encode the categoricals.
# get_dummies(columns=...) keeps the untouched columns first and appends the
# dummy columns in the order of `cluster_cat_cols`.
cluster_train = pd.get_dummies(df_train.drop(columns=['credit'] + cluster_skip_cols), columns=cluster_cat_cols)
cluster_test = pd.get_dummies(df_test.drop(columns=cluster_skip_cols), columns=cluster_cat_cols)

# The two frames are encoded independently, so a category present in only one
# of them would give them different columns. Force the test matrix onto the
# train layout (missing dummies become 0, unseen ones are discarded) so the
# scaler and K-means always see the columns they were fitted on.
cluster_test = cluster_test.reindex(columns=cluster_train.columns, fill_value=0)

# Scale with statistics from train only.
scaler = MinMaxScaler()
scaler.fit(cluster_train.values)

cluster_train = scaler.transform(cluster_train.values)
cluster_test = scaler.transform(cluster_test.values)

km = KMeans(n_clusters=100, random_state=42, max_iter=1000, )
# km = AgglomerativeClustering(n_clusters=100, )

# Fit the centroids on train, then assign both sets to their nearest centroid.
km.fit(cluster_train)

km_train = km.predict(cluster_train)
km_test = km.predict(cluster_test)

train['cluster'] = km_train
test['cluster'] = km_test

# Columns excluded from the model's feature list in the training cells.
drop_col = ['index', 'FLAG_MOBIL', 'DAYS_BIRTH_DATE', 'DAYS_EMPLOYED_DATE', 'begin_month_DATE', 'DAYS_EMPLOYED_binary', 'DAYS_EMPLOYED', 'DAYS_BIRTH',
            'begin_month_year', 'begin_month_month', 'uid_rows', 'rank', 'income_total_y',]

In [None]:
# df_train['rank'] = 0
# temp_dict = {}
# for i in df_train.index:

#     key = '_'.join(df_train.loc[i, :'family_size'].astype(str))
#     try:
#         temp_dict[key]
#         temp_dict[key] += 1
#     except:
#         temp_dict[key] = 0
    
#     df_train.loc[i, 'rank'] = temp_dict[key]
# df_test['rank']=0

# df_train['duplicated'] = df_train.loc[:, :'begin_month'].duplicated().astype(int)
# df_test['duplicated'] = df_test.loc[:, :'begin_month'].duplicated().astype(int)

In [None]:
# Carve a local hold-out out of the labelled data: rows from position 22000
# onwards become `test`, the first 22000 rows stay as `train`.
# `test` must be taken first, while `train` still holds every row.
test = train.iloc[22000:].reset_index(drop=True)
train = train.iloc[:22000].reset_index(drop=True)

In [None]:
# Remove the label column from the hold-out frame.
test = test.drop('credit', axis=1)

In [None]:
# Binary stage: class 0 vs. classes 1 and 2 merged, stratified K-fold CV.
folds = 5
skf = StratifiedKFold(n_splits=folds, random_state=0, shuffle=True)

oof = np.zeros([len(train), 2])   # out-of-fold probabilities for the training rows
pred = np.zeros([len(test), 2])   # fold-averaged probabilities for the test rows
lgb_models = {}

# n_estimators budget per boosting type.
params = {'dart':100, 'gbdt':5000, 'rf':5000, 'goss':5000}
learn_type = 'gbdt'

features = [col for col in test.columns if col not in drop_col]

# These do not depend on the fold, so build them once.
X_train = train[features]
X_test = test[features]
y_train = train[TARGET].map({0:0, 1:1, 2:1}).values
feature_importances = np.zeros(len(features))

# Folds are stratified on the original 3-class label, not on the merged one.
for idx, (trn_idx, val_idx) in enumerate(skf.split(train, train['credit'])):
    clf = LGBMClassifier(
                         boosting_type=learn_type,
                         objective='binary', #['binary', 'multiclass', 'multiclassova']
                         metrics='log_loss', #['log_loss', 'multi_logloss', 'multi_error']
                         n_estimators=params[learn_type],
                         max_depth=64,
                         learning_rate=0.03,
                         colsample_bytree=0.5,
                         subsample=0.7,
                         num_leaves=256,
                         reg_alpha=0.01,
                         reg_lambda=0.01,
                         random_state=0,
                        )
    # split() yields positional indices, so select rows with iloc; this stays
    # correct even if `train` does not carry a 0..n-1 index.
    X_trn, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx]
    evals = [(X_trn, y_train[trn_idx]), (X_val, y_train[val_idx])]
    # NOTE(review): the early_stopping_rounds / verbose fit kwargs belong to the
    # LightGBM 3.x sklearn API — confirm the installed version before upgrading.
    clf.fit(X_trn, y_train[trn_idx], eval_set=evals, eval_metric='logloss', early_stopping_rounds=100, verbose=500)

    # Average importances and test predictions over the folds.
    feature_importances += clf.feature_importances_ / folds

    oof[val_idx] = clf.predict_proba(X_val, num_iteration=clf.best_iteration_)
    pred += clf.predict_proba(X_test, num_iteration=clf.best_iteration_) / folds

    lgb_models[idx] = clf
    print(idx, 'fold complete ################################\n')

# Overall out-of-fold log loss (displayed as the cell output).
log_loss(y_train, oof)

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[82]	valid_0's binary_logloss: 0.196579	valid_1's binary_logloss: 0.31414
0 fold complete ################################

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[101]	valid_0's binary_logloss: 0.179188	valid_1's binary_logloss: 0.312159
1 fold complete ################################

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[115]	valid_0's binary_logloss: 0.167826	valid_1's binary_logloss: 0.309396
2 fold complete ################################

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[72]	valid_0's binary_logloss: 0.206537	valid_1's binary_logloss: 0.325476
3 fold complete ################################

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[88]	vali

0.31699643690860324

In [None]:
# Append the stage-1 probabilities as features p1 / p2.
# `axis` is passed by keyword (pandas 2.x no longer accepts it positionally) and
# the probability frames reuse the target index so rows stay aligned even if
# the frame's index is not 0..n-1.
train = pd.concat([train, pd.DataFrame(oof, columns=['p1', 'p2'], index=train.index)], axis=1)
test = pd.concat([test, pd.DataFrame(pred, columns=['p1', 'p2'], index=test.index)], axis=1)

In [None]:
# train['credit'] = train['credit'].map({0:0, 1:1, 2:1})
# train = train[train[TARGET] != -1].reset_index(drop=True)

# Multiclass stage: grouped K-fold CV with one probability column per class.
target_len = train[TARGET].nunique()
folds = 5

skf = StratifiedKFold(n_splits=folds, random_state=0, shuffle=True)
gk = GroupKFold(n_splits=folds)
kf = KFold(n_splits=folds, random_state=0, shuffle=True)

oof = np.zeros([len(train), target_len])   # out-of-fold class probabilities
pred = np.zeros([len(test), target_len])   # fold-averaged test probabilities
lgb_models = {}

# y = train['begin_month'].astype(str) + '_' +train['credit'].astype(str)
# for trn_idx, val_idx in skf.split(train, y):

# n_estimators budget per boosting type.
params = {'dart':1000, 'gbdt':5000, 'rf':5000, 'goss':5000}
learn_type = 'gbdt'

features = [col for col in test.columns if col not in drop_col]

# These do not depend on the fold, so build them once.
X_train = train[features]
X_test = test[features]
y_train = train[TARGET].values
feature_importances = np.zeros(len(features))

# Alternative split strategies that were tried:
# for idx, (trn_idx, val_idx) in enumerate(kf.split(train)):
# for idx, (trn_idx, val_idx) in enumerate(skf.split(train, train['credit'].map({0:0, 1:1, 2:1}))):
# for idx, (trn_idx, val_idx) in enumerate(skf.split(train, train['rank'])):
# for idx, (trn_idx, val_idx) in enumerate(gk.split(train, groups=train['prior_credit'])):
# for idx, (trn_idx, val_idx) in enumerate(skf.split(train, train['begin_month'].astype(str) + '_' + train['credit'].astype(str))):
# for idx, (trn_idx, val_idx) in enumerate(skf.split(train, train['credit'])):
for idx, (trn_idx, val_idx) in enumerate(gk.split(train, groups=train['rank'])):
    # print(train.iloc[val_idx]['rank'].value_counts())

    clf = LGBMClassifier(
                         boosting_type=learn_type,
                         objective='multiclass', #['binary', 'multiclass', 'multiclassova']
                         num_class=target_len,  # was misspelled `num_calsss`, which LightGBM does not recognise
                         metrics='multi_logloss', #['log_loss', 'multi_logloss', 'multi_error']
                         n_estimators=params[learn_type],
                         max_depth=32,
                         learning_rate=0.03,
                         colsample_bytree=0.5,
                         subsample=0.7,
                         num_leaves=256,
                         reg_alpha=0.01,
                         reg_lambda=0.01,
                         random_state=0,
                        )
    # split() yields positional indices, so select rows with iloc.
    X_trn, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx]
    evals = [(X_trn, y_train[trn_idx]), (X_val, y_train[val_idx])]
    clf.fit(X_trn, y_train[trn_idx], eval_set=evals, early_stopping_rounds=100, verbose=500)

    # Average importances and test predictions over the folds.
    feature_importances += clf.feature_importances_ / folds

    oof[val_idx] = clf.predict_proba(X_val)
    pred += clf.predict_proba(X_test) / folds

    lgb_models[idx] = clf
    print(idx, 'fold complete ################################\n')

# Overall out-of-fold log loss (displayed as the cell output).
log_loss(y_train, oof)

In [None]:
# train['credit'] = train['credit'].map({0:0, 1:1, 2:1})
# train = train[train[TARGET] != -1].reset_index(drop=True)

# Multiclass stage (max_depth=64 variant): grouped K-fold CV.
target_len = train[TARGET].nunique()
folds = 5   # single source of truth for the fold count used below

skf = StratifiedKFold(n_splits=folds, random_state=0, shuffle=True)
gk = GroupKFold(n_splits=folds)
kf = KFold(n_splits=folds, random_state=0, shuffle=True)

oof = np.zeros([len(train), target_len])   # out-of-fold class probabilities
pred = np.zeros([len(test), target_len])   # fold-averaged test probabilities
lgb_models = {}

# y = train['begin_month'].astype(str) + '_' +train['credit'].astype(str)
# for trn_idx, val_idx in skf.split(train, y):

# n_estimators budget per boosting type.
params = {'dart':100, 'gbdt':5000, 'rf':5000, 'goss':5000}
learn_type = 'gbdt'

features = [col for col in test.columns if col not in drop_col]

# These do not depend on the fold, so build them once.
X_train = train[features]
X_test = test[features]
y_train = train[TARGET].values
feature_importances = np.zeros(len(features))

# Alternative split strategies that were tried:
# for idx, (trn_idx, val_idx) in enumerate(skf.split(train, train['begin_month'].astype(str) + '_' + train['credit'].astype(str))):
# for idx, (trn_idx, val_idx) in enumerate(skf.split(train, train['credit'].map({0:0, 1:1, 2:1}))):
# for idx, (trn_idx, val_idx) in enumerate(skf.split(train, train['credit'])):
# for idx, (trn_idx, val_idx) in enumerate(skf.split(train, train['rank'])):
# for idx, (trn_idx, val_idx) in enumerate(kf.split(train)):
for idx, (trn_idx, val_idx) in enumerate(gk.split(train, groups=train['rank'])):
    # Show how the grouping column is distributed inside this validation fold.
    print(train.iloc[val_idx]['rank'].value_counts())

    clf = LGBMClassifier(
                         boosting_type=learn_type,
                         objective='multiclass', #['binary', 'multiclass', 'multiclassova']
                         num_class=target_len,  # was misspelled `num_calsss`, which LightGBM does not recognise
                         metrics='multi_logloss', #['log_loss', 'multi_logloss', 'multi_error']
                         n_estimators=params[learn_type],
                         max_depth=64,
                         learning_rate=0.03,
                         colsample_bytree=0.5,
                         subsample=0.7,
                         num_leaves=256,
                         reg_alpha=0.01,
                         reg_lambda=0.01,
                         random_state=0,
                        )
    # split() yields positional indices, so select rows with iloc.
    X_trn, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx]
    evals = [(X_trn, y_train[trn_idx]), (X_val, y_train[val_idx])]
    clf.fit(X_trn, y_train[trn_idx], eval_set=evals, early_stopping_rounds=100, verbose=500)

    # Average importances and test predictions over the folds.
    feature_importances += clf.feature_importances_ / folds

    oof[val_idx] = clf.predict_proba(X_val, num_iteration=clf.best_iteration_)
    pred += clf.predict_proba(X_test, num_iteration=clf.best_iteration_) / folds

    lgb_models[idx] = clf
    print(idx, 'fold complete ################################\n')

# Overall out-of-fold log loss (displayed as the cell output).
log_loss(y_train, oof)

In [None]:
# Keep hold of the first probability column of the current OOF / test
# predictions. These are views onto `oof` / `pred`, not copies.
oof_0, pred_0 = oof[:, 0], pred[:, 0]

In [None]:
# Independent copies of the current predictions, safe to edit in place.
oof_temp, pred_temp = oof.copy(), pred.copy()

In [None]:
# Overwrite the first probability column of each copy with the saved column.
for blended, saved_col in ((oof_temp, oof_0), (pred_temp, pred_0)):
    blended[:, 0] = saved_col

In [None]:
# Log loss of the blended out-of-fold matrix (displayed as the cell output).
# NOTE(review): the labels are read from df_train rather than train — this
# assumes both hold the same rows in the same order; confirm.
log_loss(df_train[TARGET], oof_temp)

0.6815677554297949

In [None]:
# Row-wise adjustment of the blended OOF probabilities.
#   * rows summing to less than 1 are copied unchanged (adding the missing
#     mass to the 2nd and 3rd columns was tried and is disabled);
#   * every other row has its excess over 1 removed in three equal parts.
# NOTE(review): subtracting an equal share can push a small probability below
# zero — confirm that is acceptable for the metric.
oof_ = np.zeros([len(train), target_len])

# `row_idx` replaces the former loop variable `iter`, which shadowed the builtin.
for row_idx, (p0, p1, p2) in enumerate(oof_temp):
    if not (sum([p0, p1, p2]) < 1):
        share = (p0 + p1 + p2 - 1)/3
        p0 -= share
        p1 -= share
        p2 -= share

    oof_[row_idx] = [p0, p1, p2]

# Log loss after the adjustment (displayed as the cell output).
log_loss(df_train[TARGET], oof_)

In [None]:
# Parameters found through hyper-parameter optimisation.
# Uses LightGBM's dart booster; the original note says it is about 0.03 better on the leaderboard.
# Scores noted: gbdt 0.3285, dart 0.3255, goss 0.3300.
# NOTE(review): num_class is 19 although the classifiers in this notebook are
# built for a 3-class target — confirm before reusing this dict.
lgb_param_dart = dict(
    objective='multiclass',
    num_class=19,
    boosting_type='dart',
    subsample_freq=5,
    num_leaves=92,
    min_data_in_leaf=64,
    subsample_for_bin=23000,
    max_depth=10,
    feature_fraction=0.302,
    bagging_fraction=0.904,
    lambda_l1=0.099,
    lambda_l2=1.497,
    min_child_weight=38.011,
    nthread=32,
    metric='multi_logloss',
    learning_rate=0.021,
    min_sum_hessian_in_leaf=3,
    drop_rate=0.846244,
    skip_drop=0.792465,
    max_drop=65,
    seed=42,
    n_estimators=1000,
)

In [None]:
# Overwrite `phone` with 1 where phone + work_phone exceeds 1, else 0.
# The cell replaces its own input column, so it is not safe to run twice.
for frame in (train, test):
    phone_sum_gt1 = (frame['phone'] + frame['work_phone']) > 1
    frame['phone'] = phone_sum_gt1.astype(int)

In [None]:
# Pseudo user id: DAYS_BIRTH and DAYS_EMPLOYED joined by '_'.
# (car, reality, child_num, income_type and edu_type were tried as extra key
# parts and are currently left out.)
for frame in (train, test):
    frame['uid'] = frame['DAYS_BIRTH'].astype(str) + '_' + frame['DAYS_EMPLOYED'].astype(str)

# Per-uid mean / std of begin_month, computed on train only and mapped onto
# both frames; ids with no value (unseen id, or std of a single row) get -99.
begin_month_by_uid = train.groupby('uid')['begin_month']
uid_mean = begin_month_by_uid.mean()
uid_std = begin_month_by_uid.std()

train['ttt'] = train['uid'].map(uid_mean).fillna(-99)
test['ttt'] = test['uid'].map(uid_mean).fillna(-99)
train['tttt'] = train['uid'].map(uid_std).fillna(-99)
test['tttt'] = test['uid'].map(uid_std).fillna(-99)

# train['uid'] = (
#                 train['DAYS_BIRTH'].astype(str) + '_' + 
#                 # train['gender'].astype(str) + '_' + 
#                 # train['reality'].astype(str) + '_' + 
#                 # train['child_num'].astype(str) + '_' + 
#                 train['income_type'].astype(str) + '_' + 
#                 # train['edu_type'].astype(str) + '_' + 
#                 train['DAYS_EMPLOYED'].astype(str))

# test['uid'] = (
#                 test['DAYS_BIRTH'].astype(str) + '_' + 
#                 # test['gender'].astype(str) + '_' + 
#                 # test['reality'].astype(str) + '_' + 
#                 # test['child_num'].astype(str) + '_' + 
#                 test['income_type'].astype(str) + '_' + 
#                 # test['edu_type'].astype(str) + '_' + 
#                 test['DAYS_EMPLOYED'].astype(str))

# train['attt'] = train['uid'].map(train.groupby('DAYS_BIRTH')['begin_month'].mean())
# test['attt'] = test['uid'].map(train.groupby('DAYS_BIRTH')['begin_month'].mean()).fillna(-99)
# train['atttt'] = train['uid'].map(train.groupby('DAYS_BIRTH')['begin_month'].std())
# test['atttt'] = test['uid'].map(train.groupby('DAYS_BIRTH')['begin_month'].std()).fillna(-99)

# train['bttt'] = train['DAYS_EMPLOYED'].map(train.groupby('DAYS_EMPLOYED')['begin_month'].mean())
# test['bttt'] = test['DAYS_EMPLOYED'].map(train.groupby('DAYS_EMPLOYED')['begin_month'].mean()).fillna(-99)
# train['btttt'] = train['DAYS_EMPLOYED'].map(train.groupby('DAYS_EMPLOYED')['begin_month'].std())
# test['btttt'] = test['DAYS_EMPLOYED'].map(train.groupby('DAYS_EMPLOYED')['begin_month'].std()).fillna(-99) 

In [None]:
# from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
# from sklearn.cluster import KMeans, AgglomerativeClustering

# cluster_train = df_train.copy()
# cluster_test = df_test.copy()
# for col in ['gender', 'car', 'reality', 'edu_type', 'house_type', 'occyp_type', 'income_type', 'family_type', 'work_phone', 'phone', 'email']:
#     cluster_train = pd.concat([cluster_train, pd.get_dummies(cluster_train[col], prefix=col)], 1)
#     cluster_test = pd.concat([cluster_test, pd.get_dummies(cluster_test[col], prefix=col)], 1)
#     cluster_train = cluster_train.drop(columns=col)
#     cluster_test = cluster_test.drop(columns=col)
# cluster_train.drop(columns='credit', inplace=True)
# # d = [col for col in cluster_train.columns if 'DATE' not in col]
# # cluster_train = cluster_train[d]
# # cluster_test = cluster_test[d]

# cluster_train = cluster_train[['DAYS_BIRTH', 'DAYS_EMPLOYED', 'begin_month']]
# cluster_test = cluster_test[['DAYS_BIRTH', 'DAYS_EMPLOYED', 'begin_month']]


# scaler = MinMaxScaler()
# scaler.fit(cluster_train.values)

# cluster_train = scaler.transform(cluster_train.values)
# cluster_test = scaler.transform(cluster_test.values)

# km = KMeans(n_clusters=100, random_state=42, max_iter=1000, )
# # km = AgglomerativeClustering(n_clusters=100, )

# km.fit(cluster_train)

# km_train = km.predict(cluster_train)
# km_test = km.predict(cluster_test)

# train['cluster'] = km_train
# test['cluster'] = km_test

In [None]:
# train = pd.concat([train, pd.get_dummies(train['child_num'], prefix='child_num')], 1)
# test = pd.concat([test, pd.get_dummies(test['child_num'], prefix='child_num')], 1)

# train = pd.concat([train, pd.get_dummies(train['begin_month'], prefix='begin_month')], 1)
# test = pd.concat([test, pd.get_dummies(test['begin_month'], prefix='begin_month')], 1)

# train = pd.concat([train, pd.get_dummies(train['edu_type'], prefix='edu_type')], 1)
# test = pd.concat([test, pd.get_dummies(test['edu_type'], prefix='edu_type')], 1)

# train = pd.concat([train, pd.get_dummies(train['house_type'], prefix='house_type')], 1)
# test = pd.concat([test, pd.get_dummies(test['house_type'], prefix='house_type')], 1)

# train = pd.concat([train, pd.get_dummies(train['occyp_type'], prefix='occyp_type')], 1)
# test = pd.concat([test, pd.get_dummies(test['occyp_type'], prefix='occyp_type')], 1)

# train = pd.concat([train, pd.get_dummies(train['income_type'], prefix='income_type')], 1)
# test = pd.concat([test, pd.get_dummies(test['income_type'], prefix='income_type')], 1)

# train = pd.concat([train, pd.get_dummies(train['family_type'], prefix='family_type')], 1)
# test = pd.concat([test, pd.get_dummies(test['family_type'], prefix='family_type')], 1)

In [None]:
    # t_features = ['t1', 't2', 't3']
    # a = train.loc[trn_idx].groupby('DAYS_BIRTH')[TARGET].value_counts(True).unstack().fillna(0.0)
    # a.columns = t_features

    # for col in t_features:
    #     X_train = np.concatenate([X_train, train['DAYS_BIRTH'].map(a[col].to_dict()).values.reshape(-1, 1)], 1)
    #     X_test = np.concatenate([X_test, test['DAYS_BIRTH'].map(a[col].to_dict()).values.reshape(-1, 1)], 1)

    # t_features = ['t4', 't5', 't6']
    # a = train.loc[trn_idx].groupby('begin_month')[TARGET].value_counts(True).unstack().fillna(0.0)
    # a.columns = t_features

    # for col in t_features:
    #     X_train = np.concatenate([X_train, train['begin_month'].map(a[col].to_dict()).values.reshape(-1, 1)], 1)
    #     X_test = np.concatenate([X_test, test['begin_month'].map(a[col].to_dict()).values.reshape(-1, 1)], 1)

    # t_features = ['t7', 't8', 't9']
    # a = train.loc[trn_idx].groupby('DAYS_BIRTH_day')[TARGET].value_counts(True).unstack().fillna(0.0)
    # a.columns = t_features

    # for col in t_features:
    #     X_train = np.concatenate([X_train, train['DAYS_BIRTH_day'].map(a[col].to_dict()).values.reshape(-1, 1)], 1)
    #     X_test = np.concatenate([X_test, test['DAYS_BIRTH_day'].map(a[col].to_dict()).values.reshape(-1, 1)], 1)

    # t_features = ['t10', 't11', 't12']
    # a = train.loc[trn_idx].groupby('DAYS_BIRTH_month')[TARGET].value_counts(True).unstack().fillna(0.0)
    # a.columns = t_features

    # for col in t_features:
    #     X_train = np.concatenate([X_train, train['DAYS_BIRTH_month'].map(a[col].to_dict()).values.reshape(-1, 1)], 1)
    #     X_test = np.concatenate([X_test, test['DAYS_BIRTH_month'].map(a[col].to_dict()).values.reshape(-1, 1)], 1)

In [None]:
# Tried a post-processing (pp) step on the predictions.

# adj_arr = []
# for i in range(oof.shape[0]):
#     temp = oof[i].copy()
#     if (oof[i]<0.1).any():
#         min = oof[i][np.argmin(oof[i])]

#         temp[np.argmin(temp)] = temp[np.argmin(temp)] - min
#         temp[np.argmax(temp)] = temp[np.argmax(temp)] + min

#     adj_arr += [temp]
# adj_arr = np.array(adj_arr)
# log_loss(y_train, adj_arr)

In [None]:
# train['edu_type'] = train['edu_type'].astype('category')
# test['edu_type'] = test['edu_type'].astype('category')

# train['income_type'] = train['income_type'].astype('category')
# test['income_type'] = test['income_type'].astype('category')

# train['family_type'] = train['family_type'].astype('category')
# test['family_type'] = test['family_type'].astype('category')

# train['house_type'] = train['house_type'].astype('category')
# test['house_type'] = test['house_type'].astype('category')

In [None]:
# train['DAYS_EMPLOYED_day'] = (train['DAYS_EMPLOYED']/24).round()
# test['DAYS_EMPLOYED_day'] = (test['DAYS_EMPLOYED']/24).round()

# train['phone2'] = train[['work_phone', 'phone']].sum(1)
# test['phone2'] = test[['work_phone', 'phone']].sum(1)

# col = 'DAYS_EMPLOYED'
# train['income_mean'] = train[col].map(train.groupby(col)['income_total'].mean().to_dict()).round(2)
# test['income_mean'] = test[col].map(train.groupby(col)['income_total'].mean().to_dict()).round(2)

# col1 = 'DAYS_BIRTH_month'
# col2 = 'DAYS_BIRTH_day'
# for df in [train, test]:
#     df['ttt'] = df[col1].astype(str) + df[col2].astype(str)
# test['ttt'] = test['ttt'].map(train['ttt'].value_counts())
# train['ttt'] = train['ttt'].map(train['ttt'].value_counts())



# train['bigin_year'] = (train['begin_month']/12).round()
# test['bigin_year'] = (test['begin_month']/12).round()

# date_dict = {i:j for i,j in zip(np.unique(train['begin_month']), pd.date_range('2014-01-01', periods=60, freq='M'))}
# train['begin_month_date'] = train['begin_month'].map(date_dict)
# test['begin_month_date'] = test['begin_month'].map(date_dict)

# train['begin_month_date_month'] = train['begin_month_date'].dt.month
# test['begin_month_date_month'] = test['begin_month_date'].dt.month

In [None]:
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode
  warnings.warn('Early stopping is not available in dart mode')
[100]	valid_0's multi_logloss: 0.717462	valid_1's multi_logloss: 0.795645
[200]	valid_0's multi_logloss: 0.630438	valid_1's multi_logloss: 0.748241
[300]	valid_0's multi_logloss: 0.569636	valid_1's multi_logloss: 0.72224
[400]	valid_0's multi_logloss: 0.521711	valid_1's multi_logloss: 0.703517
[500]	valid_0's multi_logloss: 0.487992	valid_1's multi_logloss: 0.696224
[600]	valid_0's multi_logloss: 0.456908	valid_1's multi_logloss: 0.689985
[700]	valid_0's multi_logloss: 0.419924	valid_1's multi_logloss: 0.683839
[800]	valid_0's multi_logloss: 0.394576	valid_1's multi_logloss: 0.682825
[900]	valid_0's multi_logloss: 0.358987	valid_1's multi_logloss: 0.682866
[1000]	valid_0's multi_logloss: 0.332996	valid_1's multi_logloss: 0.685887
0 fold complete ################################

[100]	valid_0's multi_logloss: 0.725321	valid_1's multi_logloss: 0.790456
[200]	valid_0's multi_logloss: 0.643862	valid_1's multi_logloss: 0.739061
[300]	valid_0's multi_logloss: 0.586349	valid_1's multi_logloss: 0.707981
[400]	valid_0's multi_logloss: 0.53976	valid_1's multi_logloss: 0.686057
[500]	valid_0's multi_logloss: 0.507105	valid_1's multi_logloss: 0.676175
[600]	valid_0's multi_logloss: 0.476674	valid_1's multi_logloss: 0.667773
[700]	valid_0's multi_logloss: 0.439057	valid_1's multi_logloss: 0.658935
[800]	valid_0's multi_logloss: 0.413621	valid_1's multi_logloss: 0.654829
[900]	valid_0's multi_logloss: 0.378107	valid_1's multi_logloss: 0.651918
[1000]	valid_0's multi_logloss: 0.351965	valid_1's multi_logloss: 0.652321
1 fold complete ################################

[100]	valid_0's multi_logloss: 0.729532	valid_1's multi_logloss: 0.780185
[200]	valid_0's multi_logloss: 0.648001	valid_1's multi_logloss: 0.726115
[300]	valid_0's multi_logloss: 0.590496	valid_1's multi_logloss: 0.694461
[400]	valid_0's multi_logloss: 0.544351	valid_1's multi_logloss: 0.673411
[500]	valid_0's multi_logloss: 0.512577	valid_1's multi_logloss: 0.663777
[600]	valid_0's multi_logloss: 0.48281	valid_1's multi_logloss: 0.65515
[700]	valid_0's multi_logloss: 0.44577	valid_1's multi_logloss: 0.646621
[800]	valid_0's multi_logloss: 0.420848	valid_1's multi_logloss: 0.64398
[900]	valid_0's multi_logloss: 0.38563	valid_1's multi_logloss: 0.641078
[1000]	valid_0's multi_logloss: 0.359937	valid_1's multi_logloss: 0.640073
2 fold complete ################################

[100]	valid_0's multi_logloss: 0.727141	valid_1's multi_logloss: 0.79466
[200]	valid_0's multi_logloss: 0.644628	valid_1's multi_logloss: 0.74353
[300]	valid_0's multi_logloss: 0.586057	valid_1's multi_logloss: 0.715921
[400]	valid_0's multi_logloss: 0.538595	valid_1's multi_logloss: 0.695579
[500]	valid_0's multi_logloss: 0.506456	valid_1's multi_logloss: 0.686885
[600]	valid_0's multi_logloss: 0.476331	valid_1's multi_logloss: 0.681176
[700]	valid_0's multi_logloss: 0.43846	valid_1's multi_logloss: 0.672951
[800]	valid_0's multi_logloss: 0.413368	valid_1's multi_logloss: 0.670078
[900]	valid_0's multi_logloss: 0.378338	valid_1's multi_logloss: 0.667854
[1000]	valid_0's multi_logloss: 0.352662	valid_1's multi_logloss: 0.667985
3 fold complete ################################

[100]	valid_0's multi_logloss: 0.71988	valid_1's multi_logloss: 0.815058
[200]	valid_0's multi_logloss: 0.637212	valid_1's multi_logloss: 0.770251
[300]	valid_0's multi_logloss: 0.578601	valid_1's multi_logloss: 0.746026
[400]	valid_0's multi_logloss: 0.531626	valid_1's multi_logloss: 0.729811
[500]	valid_0's multi_logloss: 0.499056	valid_1's multi_logloss: 0.723775
[600]	valid_0's multi_logloss: 0.469467	valid_1's multi_logloss: 0.719303
[700]	valid_0's multi_logloss: 0.432082	valid_1's multi_logloss: 0.715957
[800]	valid_0's multi_logloss: 0.407626	valid_1's multi_logloss: 0.714832
[900]	valid_0's multi_logloss: 0.37257	valid_1's multi_logloss: 0.716798
[1000]	valid_0's multi_logloss: 0.346343	valid_1's multi_logloss: 0.719622
4 fold complete ################################

0.6742893830660625

In [None]:
# df_train = pd.read_csv('open/train.csv')
# df_test = pd.read_csv('open/test.csv')
# sub = pd.read_csv('open/sample_submission.csv')

# # df_train = df_train.sort_values('begin_month', ascending=True).reset_index(drop=True)
# # df_test = df_test.sort_values('begin_month', ascending=True).reset_index(drop=True)

# df_train.loc[df_train['child_num'] > df_train['family_size'], 'family_size'] = 4
# df_train.loc[(df_train['family_type']=='Married') & (df_train['family_size']<2), 'family_size'] = 3
# df_train.loc[df_train['child_num'] == df_train['family_size'], 'family_size'] = 2

# df_test.loc[df_test['child_num'] > df_test['family_size'], 'family_size'] = 4
# df_test.loc[(df_test['family_type']=='Married') & (df_test['family_size']<2), 'family_size'] = 3

# # df_train = df_train.loc[df_train.loc[:, 'gender':'begin_month'].drop_duplicates().index].reset_index(drop=True)
# # df_train = df_train.loc[:, 'gender':'credit'].drop_duplicates().reset_index(drop=True)

# temp = []
# duplicates = []
# for idx in tqdm(range(len(df_train))):
#     row = '_'.join(df_train.loc[idx, 'gender':'family_size'].astype(str).tolist())
#     # row = '_'.join(df_train.loc[idx, ['DAYS_BIRTH', 'DAYS_EMPLOYED']].astype(str).tolist())
#     # row = '_'.join(df_train.loc[idx, 'car':'family_size'].astype(str).tolist())
#     if row in temp:
#         # duplicates.append(np.isin(temp, row).sum() + 1)
#         duplicates.append(sum([row==i for i in temp]) + 1)
#     else:
#         duplicates.append(1)
#     temp.append(row)

# df_train['rank'] = duplicates
# df_train['uid_rows'] = temp

# duplicates = []
# for idx in tqdm(range(len(df_test))):
#     row = '_'.join(df_test.loc[idx, 'gender':'family_size'].astype(str).tolist())
#     if row in temp:
#         # duplicates.append(np.isin(temp, row).sum() + 1)
#         duplicates.append(sum([row==i for i in temp]) + 1)
#     else:
#         duplicates.append(1)
#     temp.append(row)

# df_test['rank'] = duplicates
# df_test['uid_rows'] = temp[-10000:]

# Work on copies so the raw frames stay untouched.
train, test = df_train.copy(), df_test.copy()

# Dependents
# train['n_dependents'] = train['family_size'] - train['family_type'].apply(lambda x: 2 if 'Marr' in x else 1)
# test['n_dependents'] = test['family_size'] - test['family_type'].apply(lambda x: 2 if 'Marr' in x else 1)
# n_dependents2 = family size minus number of children.
for frame in (train, test):
    frame['n_dependents2'] = frame['family_size'].sub(frame['child_num'])
# train['n_dependents3'] = train['n_dependents'] - train['child_num']
# test['n_dependents3'] = test['n_dependents'] - test['child_num']

# Name of the label column.
TARGET = 'credit'

# Categorical handling
for df in [train, test]:
    # Assign the filled column back. Calling fillna(inplace=True) on a column
    # selection is a chained in-place update that pandas warns about and that
    # does nothing under copy-on-write.
    df['occyp_type'] = df['occyp_type'].fillna('NULL')
    df['reality'] = df['reality'].astype('category')
    df['family_type'] = df['family_type'].astype('category')
    df['gender'] = df['gender'].map({'F':0, 'M':1})

# Frequency encoding: replace each value by its relative frequency in train.
# A value that never occurs in train maps to NaN in test.
for col in ['car', 'edu_type', 'house_type', 'occyp_type', 'income_type', 'work_phone', 'phone', 'email']:
    train_freq = train[col].value_counts(normalize=True).to_dict()
    for df in [train, test]:
        df[col] = df[col].map(train_freq)

# How often each uid_rows key occurs in train (NaN for keys absent from train).
uid_rows_counts = train['uid_rows'].value_counts()
test['uid_rows_vc'] = test['uid_rows'].map(uid_rows_counts)
train['uid_rows_vc'] = train['uid_rows'].map(uid_rows_counts)

# True where DAYS_EMPLOYED is positive.
train['DAYS_EMPLOYED_binary'] = train['DAYS_EMPLOYED']>0
test['DAYS_EMPLOYED_binary'] = test['DAYS_EMPLOYED']>0

# Lookup tables from an absolute offset (0 = most recent period) to a calendar
# date, counting backwards from 2019-12-31. Each table is built once and shared
# by both frames.
birth_date_lookup = dict(enumerate(pd.date_range(end='2019-12-31', periods=25153, freq='d')[::-1]))
employed_date_lookup = dict(enumerate(pd.date_range(end='2019-12-31', periods=15714, freq='d')[::-1]))
begin_month_lookup = dict(enumerate(pd.date_range(end='2019-12-31', periods=61, freq='m')[::-1]))

for frame in (train, test):
    frame['DAYS_BIRTH_DATE'] = frame['DAYS_BIRTH'].abs().map(birth_date_lookup)

    # Not employed -> NULL: offsets outside the table come back missing and
    # are replaced by the sentinel date 2020-01-01.
    employed_dates = frame['DAYS_EMPLOYED'].abs().map(employed_date_lookup)
    frame['DAYS_EMPLOYED_DATE'] = pd.to_datetime(employed_dates.fillna('2020-01-01'))

    frame['begin_month_DATE'] = frame['begin_month'].abs().map(begin_month_lookup)

for frame in (train, test):
    birth = frame['DAYS_BIRTH_DATE']
    employed = frame['DAYS_EMPLOYED_DATE']

    # Age as of 2019, and years remaining until 60 (retirement age).
    frame['age'] = 2019 - birth.dt.year
    frame['age2'] = 60 - frame['age']
    # Years worked as of 2019
    # frame['EMPLOYED_year'] = (2019 - frame['DAYS_EMPLOYED_DATE'].dt.year)
    # frame['age3'] = frame['age'] - frame['age3']
    # frame['age3'] = frame['age'] - (2019 - frame['begin_month_DATE'].dt.year)
    # frame['loan_age'] = frame['age'] - (2019 - frame['begin_month_year'])
    # frame['loan_age'] = frame['loan_age']//10

    # Calendar parts of the birth date.
    frame['DAYS_BIRTH_week'] = birth.dt.isocalendar().week.astype(int)
    frame['DAYS_BIRTH_day'] = birth.dt.day
    frame['DAYS_BIRTH_weekday'] = birth.dt.weekday

    # Calendar parts of the employment start date.
    frame['DAYS_EMPLOYED_10y'] = employed.dt.year//10
    frame['DAYS_EMPLOYED_DATE_month'] = employed.dt.month
    frame['DAYS_EMPLOYED_DATE_week'] = employed.dt.isocalendar().week.astype(int)
    frame['DAYS_EMPLOYED_DATE_day'] = employed.dt.day
    frame['DAYS_EMPLOYED_DATE_weekday'] = employed.dt.weekday

    # Day difference (birth date minus employment date), floor-divided by 12.
    frame['employment_days'] = (birth - employed).dt.days//12

    # Rows carrying the 2020-01-01 sentinel get -1 in the employment-derived
    # columns listed here (DAYS_EMPLOYED_10y is not part of the list).
    sentinel_rows = employed == '2020-01-01'
    frame.loc[sentinel_rows, ['DAYS_EMPLOYED_DATE_month', 'DAYS_EMPLOYED_DATE_week', 'DAYS_EMPLOYED_DATE_day', 'DAYS_EMPLOYED_DATE_weekday', 'employment_days']] = -1

    # Year and month of the begin_month date.
    issue_date = frame['begin_month_DATE']
    frame['begin_month_year'] = issue_date.dt.year
    frame['begin_month_month'] = issue_date.dt.month

    # frame['diff_DAYS_EMPLOYED_begin_month_month'] = (frame['begin_month_year']*12 + frame['begin_month_month']) - (frame['DAYS_EMPLOYED_DATE'].dt.month*12 + frame['DAYS_EMPLOYED_DATE_month'])
    # frame['diff_DAYS_EMPLOYED_begin_month_month'] = (frame['diff_DAYS_EMPLOYED_begin_month_month']/12).round()

# for col in ['DAYS_BIRTH', 'DAYS_EMPLOYED', 'income_total']:
#     for df in [train, test]:
#         df[col] = np.log1p(df[col])

def _make_group_key(df, key_cols):
    """Return a string Series identifying the group of every row of ``df``.

    The values of ``key_cols`` are cast to ``str`` and joined with ``'_'``;
    a missing value therefore becomes the literal text ``'nan'`` and forms a
    group of its own instead of being dropped by ``groupby``.
    """
    key = df[key_cols[0]].astype(str)
    for col in key_cols[1:]:
        key = key + '_' + df[col].astype(str)
    return key


def create_features(train, test, uid, feature, aggs):
    """Add group-wise aggregates of ``feature`` to copies of ``train`` and ``test``.

    Rows are grouped by the combination of the ``uid`` columns. Every statistic
    is computed on ``train`` only and then mapped onto both frames, so a group
    that occurs in ``test`` but not in ``train`` yields NaN in ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Input frames; neither is modified.
    uid : sequence of column names (or a single column name)
        Grouping columns. Any number of columns is supported.
    feature : column name
        Column of ``train`` to aggregate.
    aggs : iterable
        Aggregations accepted by ``GroupBy.agg`` (e.g. ``'mean'``, ``'std'``).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of ``train`` and ``test`` with one extra column per aggregation,
        named ``{uid1}_{uid2}_..._{feature}_{agg}``. With a single grouping
        column the name is ``{uid1}__{feature}_{agg}`` (double underscore),
        which keeps the column names produced by earlier versions.

    Raises
    ------
    ValueError
        If ``uid`` is empty.
    """
    key_cols = [uid] if isinstance(uid, str) else list(uid)
    if not key_cols:
        raise ValueError('uid must name at least one grouping column')

    tr, te = train.copy(), test.copy()

    # The keys live in standalone Series rather than in a temporary column, so
    # a real column that happens to be called 'uid' is never overwritten.
    tr_key = _make_group_key(tr, key_cols)
    te_key = _make_group_key(te, key_cols)

    prefix = '_'.join(map(str, key_cols))
    if len(key_cols) == 1:
        # Historical naming quirk for single-key features, kept on purpose.
        prefix += '_'

    grouped = tr[feature].groupby(tr_key)
    for agg in aggs:
        # Compute each statistic once and reuse it for both frames.
        stats = grouped.agg(agg)
        new_col = f'{prefix}_{feature}_{agg}'
        tr[new_col] = tr_key.map(stats)
        te[new_col] = te_key.map(stats)

    return tr, te

# Group-wise statistics of the begin_month columns, computed on train and
# mapped onto both frames. Each call returns new frames that replace the
# previous ones, so the calls build on each other in this order.
train, test = create_features(
    train, test,
    uid=['DAYS_BIRTH', 'DAYS_EMPLOYED'],
    feature='begin_month',
    aggs=['mean', 'std', 'max'],
)
train, test = create_features(
    train, test,
    uid=['DAYS_BIRTH', 'DAYS_EMPLOYED'],
    feature='begin_month_year',
    aggs=['mean', 'std'],
)
train, test = create_features(
    train, test,
    uid=['DAYS_BIRTH', 'DAYS_EMPLOYED'],
    feature='begin_month_month',
    aggs=['mean', 'std'],
)
train, test = create_features(
    train, test,
    uid=['uid_rows'],
    feature='begin_month_year',
    aggs=['mean'],
)

# train, test = create_features(train, test, ['DAYS_BIRTH'], 'begin_month_month', ['mean'])
# train, test = create_features(train, test, ['DAYS_EMPLOYED'], 'DAYS_BIRTH', ['mean'])

# train, test = create_features(train, test, ['DAYS_BIRTH', 'DAYS_EMPLOYED_binary'], 'begin_month_month', ['mean'])
# train, test = create_features(train, test, ['uid_rows'], 'begin_month_month', ['mean',])
# train, test = create_features(train, test, 'DAYS_BIRTH', 'begin_month', 'DAYS_EMPLOYED_binary', ['mean'])
# train, test = create_features(train, test, 'DAYS_BIRTH', 'DAYS_EMPLOYED_binary', 'begin_month_month', ['std'])
# train, test = create_features(train, test, 'DAYS_BIRTH', 'DAYS_EMPLOYED_DATE_month', 'begin_month_month', ['mean'])
# train, test = create_features(train, test, 'DAYS_BIRTH', 'occyp_type', 'begin_month', ['mean'])

# Columns with at least one missing value in train. Only train is inspected:
# a column that has NaN in test but none in train is not listed and is
# therefore left unfilled below.
missing_features = [
    name for name, has_nan in train.isnull().any().items() if has_nan
]
print('missing_features :', missing_features)

# Replace the missing values of those columns with the sentinel -99 in both frames.
for col in missing_features:
    train[col] = train[col].fillna(value=-99)
    test[col] = test[col].fillna(value=-99)

##### Assumes the current point in time is 2019
# Column names collected for removal; where the list is applied lies outside
# this part of the notebook.
drop_col = [
    'index', 'FLAG_MOBIL',
    'DAYS_BIRTH_DATE', 'DAYS_EMPLOYED_DATE', 'begin_month_DATE',
    'DAYS_EMPLOYED_binary', 'DAYS_EMPLOYED', 'DAYS_BIRTH',
    'begin_month_year', 'begin_month_month',
    'uid_rows', 'rank',
]
            
