In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import codecs
import json
import os
import pickle
import re
import sys
import time
import traceback

import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from glob import glob
from os.path import splitext, split, join, exists
from tqdm import tqdm
from IPython import display

In [2]:
from model_tools.gentools import gentools as gntls
from model_tools.gentools import advtools as advtls
from model_tools.plot import feature_plot as fplt
from model_tools.feature_select import feature_select as fs
from model_tools.feature_proc import feature_proc as fp
from model_tools.model import model as mdl

### read data

In [3]:
# Name of the binary label column used by every modelling step below.
target = 'target'

# Locations of the pickled datasets and of the processing configs.
dpath = 'E:/git/test_data/'
cfg_path = 'E:/git/model_tools/notebook_demo/'

# Non-feature columns (label plus id/serial/time-like fields).
# NOTE(review): not referenced in the cells visible here — presumably consumed later; verify.
exclude_cols = [
    'OBJECTNO', 'target', 'PUTOUTSERIALNO', 'CUSTOMERID',
    'order_create_time', 'order_create_at',
    'SERIALNO', 'NATURALOCCURDATE', 'APPLSERIALNO',
    'ruleSERIALNO', 'LK_MOBILENO', 'RULESERIALNO', 'JIAOYIRULE_SERIALNO',
    'ref_time',
]

In [4]:
# Load the three post-feature-selection splits (train / test / out-of-time) in one pass.
x_train, x_test, oot = (
    pd.read_pickle(join(dpath, fname))
    for fname in ('x_train_afsel.pkl', 'x_test_afsel.pkl', 'oot_afsel.pkl')
)
# Sanity check: shapes and target rate of each split.
print(*[frame.shape for frame in (x_train, x_test, oot)])
print(*[frame[target].mean() for frame in (x_train, x_test, oot)])
x_train.head(2)

(70224, 512) (30096, 512) (34251, 512)
0.059139325586694005 0.05911084529505582 0.053166330910046425


Unnamed: 0,ely_gt1_repay_amt_rf_repay_max_p7d,ovd_days_rf_due_max_freq_p7d,normal_days_rf_repay_median_p7d,repay_amt_in_table_rf_repay_max_p96d,normal_repay_amt_rf_repay_sum_p96d,ovd_days_rf_due_last_p36d,normal_repay_rto_p8repayment,LK_CUSTOMERLEVEL,normal_days_rf_repay_max_p36d,JY_CREDITMODE,...,JY_RM0009,ely_gt0_repay_amt_rf_due_max_p186d,ely_gt0_repay_amt_rf_due_sum_p186d,ovd_days_rf_repay_max_freq_p7d,ovd_gt0_repay_amt_rf_due_rto_p66d,ely_gt0_repay_amt_rf_due_max_p7d,ely_gt1_repay_rto_rf_repay_p7d,order_create_at,ovd_gt1_repay_amt_rf_due_rto_p36d,normal_repay_amt_rf_due_max_p36d
0,,0.0,,3261.81,7905.13,0.0,1.0,D1,5.0,,...,,3261.81,4898.9,,0.0,3261.81,,1587603956,0.0,3261.81
1,0.0,0.0,0.0,3006.23,3006.23,0.0,1.0,D1,0.0,,...,,0.0,0.0,0.0,0.0,0.0,0.0,1584924476,0.0,3006.23


### Preprocessing: missing-value imputation

In [5]:
# Features whose missing values are imputed with 0.
zero_fill_feas = [
    'ely_gt0_repay_amt_rf_due_max_p66d','ely_gt0_repay_amt_rf_due_max_p96d','ely_gt0_repay_amt_rf_due_max_p186d',
    'ely_gt0_repay_amt_rf_due_mean_p66d','ely_gt0_repay_amt_rf_due_mean_p96d','ely_gt0_repay_amt_rf_due_mean_p186d',
    'ely_gt0_repay_amt_rf_due_sum_p66d','ely_gt0_repay_amt_rf_due_sum_p96d','ely_gt0_repay_amt_rf_due_sum_p186d',
    'ely_gt0_repay_amt_rf_repay_max_p66d','ely_gt0_repay_amt_rf_repay_max_p96d','ely_gt0_repay_amt_rf_repay_max_p186d',
    'ely_gt0_repay_amt_rf_repay_rto_p66d','ely_gt0_repay_amt_rf_repay_rto_p96d','ely_gt0_repay_amt_rf_repay_rto_p186d',
    'ely_gt0_repay_amt_rf_repay_sum_p66d','ely_gt0_repay_amt_rf_repay_sum_p96d','ely_gt0_repay_amt_rf_repay_sum_p186d',
    'ely_gt0_repay_cnt_rf_repay_p96d','ely_gt0_repay_cnt_rf_repay_p186d','ely_gt0_repay_oid_rto_rf_due_p96d',
    'ely_gt0_repay_oid_rto_rf_due_p186d','ely_gt0_repay_oid_rto_rf_repay_p96d','ely_gt0_repay_oid_rto_rf_repay_p186d',
    'ely_gt0_repay_rto_rf_due_p66d','ely_gt0_repay_rto_rf_due_p96d','ely_gt0_repay_rto_rf_due_p186d',
    'ely_gt0_repay_rto_rf_repay_p96d','ely_gt0_repay_rto_rf_repay_p186d','ely_gt1_repay_amt_rf_due_max_p96d',
    'ely_gt1_repay_amt_rf_due_max_p186d','ely_gt1_repay_amt_rf_due_mean_p96d','ely_gt1_repay_amt_rf_due_mean_p186d',
    'ely_gt1_repay_amt_rf_due_sum_p96d','ely_gt1_repay_amt_rf_due_sum_p186d','ely_gt1_repay_amt_rf_repay_max_p96d',
    'ely_gt1_repay_amt_rf_repay_max_p186d','ely_gt1_repay_amt_rf_repay_sum_p66d','ely_gt1_repay_amt_rf_repay_sum_p96d',
    'ely_gt1_repay_amt_rf_repay_sum_p186d','ely_gt1_repay_cnt_rf_repay_p96d','ely_gt1_repay_cnt_rf_repay_p186d',
    'ely_gt3_repay_amt_rf_due_mean_p186d','ely_gt3_repay_amt_rf_repay_sum_p96d','ely_gt3_repay_amt_rf_repay_sum_p186d',
    'ely_repay_rto_p1repayment','ely_repay_rto_p2repayment','ely_repay_rto_p5repayment','ely_repay_rto_p7repayment',
    'ely_repay_rto_p9repayment','ely_repay_rto_p10repayment','ely_repay_rto_p15repayment','normal_days_rf_due_median_p66d',
    'normal_days_rf_due_median_p96d','normal_days_rf_due_median_p186d','normal_days_rf_repay_max_p96d',
    'normal_days_rf_repay_median_p186d','normal_days_rf_repay_std_p96d','normal_days_rf_repay_std_p186d',
    'normal_days_rf_repay_sum_p96d','normal_repay_cnt_rf_repay_p186d','ovd_days_rf_due_last_p7d','ovd_days_rf_due_last_p36d',
    'ovd_days_rf_due_last_p66d','ovd_days_rf_due_last_p96d','ovd_days_rf_due_last_p186d','ovd_days_rf_due_max_freq_p36d',
    'ovd_days_rf_due_max_freq_p66d','ovd_days_rf_due_max_freq_p96d','ovd_days_rf_due_max_freq_p186d',
    'ovd_days_rf_due_max_p7d','ovd_days_rf_due_max_p36d','ovd_days_rf_due_max_p66d','ovd_days_rf_due_max_p96d',
    'ovd_days_rf_due_min_p7d','ovd_days_rf_due_min_p36d','ovd_days_rf_due_min_p66d','ovd_days_rf_due_min_p96d',
    'ovd_days_rf_due_min_p186d','ovd_days_rf_due_sum_p7d','ovd_days_rf_repay_last_p7d','ovd_days_rf_repay_last_p36d',
    'ovd_days_rf_repay_last_p66d','ovd_days_rf_repay_last_p96d','ovd_days_rf_repay_max_freq_p7d','ovd_days_rf_repay_max_freq_p36d',
    'ovd_days_rf_repay_max_freq_p66d','ovd_days_rf_repay_max_freq_p96d','ovd_days_rf_repay_max_p7d','ovd_days_rf_repay_max_p36d',
    'ovd_days_rf_repay_max_p66d','ovd_days_rf_repay_max_p96d','ovd_days_rf_repay_mean_p7d','ovd_days_rf_repay_mean_p36d',
    'ovd_days_rf_repay_mean_p66d','ovd_days_rf_repay_median_p36d','ovd_days_rf_repay_median_p66d','ovd_days_rf_repay_min_p7d',
    'ovd_days_rf_repay_min_p36d','ovd_days_rf_repay_min_p66d','ovd_days_rf_repay_sum_p7d','ovd_days_rf_repay_sum_p36d',
    'ovd_days_rf_repay_sum_p66d','ovd_gt0_repay_amt_rf_due_min_p7d','ovd_gt0_repay_amt_rf_due_rto_p96d',
    'ovd_gt0_repay_amt_rf_repay_max_p36d','ovd_gt0_repay_amt_rf_repay_mean_p36d','ovd_gt0_repay_amt_rf_repay_rto_p66d',
    'ovd_gt0_repay_amt_rf_repay_sum_p36d','ovd_gt0_repay_cnt_rf_due_p7d','predictscore','ovd_gt0_repay_cnt_rf_repay_p186d',
    'ovd_gt0_repay_cnt_rf_due_p186d',
]

# Sentinel values swapped for a regular value *before* the column's NaNs are filled:
# column -> (sentinel, replacement).
sentinel_replace = {
    'JY_RISK_GRADE_NUM': (-999, 10),
}

# Feature-specific fill constants, applied in this order after the zero fills.
# NOTE(review): the constants are hand-picked; their derivation is not shown in this notebook.
const_fill = {
    'ely_gt0_repay_amt_rf_due_rto_p66d': 0.5,
    'ely_gt0_repay_amt_rf_due_rto_p96d': 0.5,
    'ely_gt0_repay_amt_rf_due_rto_p186d': 0.5,
    'ely_gt0_repay_oid_rto_rf_due_p36d': 0.5,
    'ely_gt0_repay_rto_rf_due_p36d': 0.5,
    'ely_gt0_repay_rto_rf_repay_p36d': 0.5,
    'ely_gt0_repay_rto_rf_repay_p66d': 0.35,
    'ely_gt0_repay_amt_rf_repay_max_p36d': 2000,
    'ely_gt0_repay_amt_rf_repay_sum_p7d': 1000,
    'ely_gt0_repay_amt_rf_repay_sum_p36d': 1500,
    'ely_gt1_repay_amt_rf_repay_max_p66d': 1200,
    'ely_gt3_repay_amt_rf_repay_std_p66d': 80,
    'JY_RISK_GRADE_NUM': 11,
    'LK_ADVICE_RATE': '{12=0.0006, 3=0.0006, 6=0.0006, 9=0.0006}',
    'normal_days_rf_due_max_p7d': 4,
    'normal_days_rf_due_max_p96d': 4,
    'normal_days_rf_due_mean_p96d': 2,
    'normal_days_rf_due_sum_p66d': 4,
    'normal_days_rf_due_sum_p96d': 4,
    'normal_days_rf_due_sum_p186d': 4,
    'normal_days_rf_repay_max_p36d': 5,
    'normal_days_rf_repay_mean_p7d': 4,
    'normal_days_rf_repay_median_p36d': 4,
    'normal_days_rf_repay_median_p66d': 4,
    'normal_days_rf_repay_std_p66d': 3,
    'normal_days_rf_repay_sum_p36d': 55,
    'normal_days_rf_repay_sum_p66d': 55,
    'normal_repay_amt_rf_due_max_p36d': 1300,
    'normal_repay_amt_rf_due_max_p66d': 1300,
    'normal_repay_amt_rf_due_sum_p36d': 1500,
    'normal_repay_amt_rf_due_sum_p66d': 1500,
    'normal_repay_amt_rf_due_sum_p96d': 1500,
    'normal_repay_amt_rf_due_sum_p186d': 1500,
    'normal_repay_amt_rf_repay_max_p66d': 1000,
    'normal_repay_amt_rf_repay_max_p186d': 1000,
    'normal_repay_amt_rf_repay_sum_p7d': 1500,
    'normal_repay_amt_rf_repay_sum_p66d': 2500,
    'normal_repay_amt_rf_repay_sum_p96d': 1200,
    'normal_repay_amt_rf_repay_sum_p186d': 2500,
    'normal_repay_cnt_rf_repay_p7d': 4,
    'normal_repay_rto_rf_repay_p186d': 0.8,
    'ord_succ_amt_max_in1d_p1d': 1000,
    'ord_succ_amt_max_in1d_p7d': 1500,
    'ord_succ_amt_sum_p7d': 2000,
    'ovd_days_rf_repay_sum_p186d': 2,
    'ovd_gt0_repay_amt_rf_repay_rto_p96d': 0.1,
    'ovd_gt0_repay_amt_rf_repay_rto_p186d': 0.15,
    'ovd_gt0_repay_oid_rto_rf_repay_p186d': 0.2,
    'ovd_gt0_repay_rto_rf_due_p186d': 0.15,
    'ovd_repay_amt_sum_rto_p15repayment': 0.15,
    'repay_amt_in_table_rf_due_max_p66d': 1000,
    'repay_amt_in_table_rf_due_max_p96d': 500,
    'repay_amt_in_table_rf_due_sum_p186d': 1500,
    'repay_amt_in_table_rf_repay_max_p96d': 1000,
    'repay_amt_in_table_rf_repay_sum_p186d': 2000,
}

# Apply the same imputation to every split; columns are overwritten on the frames themselves.
for frame in (x_train, x_test, oot):
    for col in zero_fill_feas:
        frame[col] = frame[col].fillna(0)

    for col, fill_value in const_fill.items():
        if col in sentinel_replace:
            frame[col] = frame[col].replace(*sentinel_replace[col])
        frame[col] = frame[col].fillna(fill_value)

### Single-feature experiments (on temporary copies)

#### clip

In [6]:
# Feature inspected in the clip experiment: read by the plotting and add_flag cells that follow.
feas = 'ovd_gt1_repay_cnt_rf_due_p7d'

In [7]:
# Bivariate plot of `feas` against the target for each split; one entry per split is
# collected in `params` and rendered through `tab.get_tab_out`.
tab = fplt.TabOut()
# Explicit name -> frame mapping instead of eval() on variable names.
splits = {'x_train': x_train, 'x_test': x_test, 'oot': oot}
params = {}
for split_name, split_df in splits.items():
    params[split_name] = fplt.PlotUtils().plot_bivar(
        split_df, feas=feas, target=target,
        cut_params=None, title=feas,
        yaxis='count_rto', pyecharts=False, mark_line=True,
        draw_lin=False, save_path=None, thred=-1, max_bins=9999,
        color_bar=['steelblue'], color_line=['red', 'black'], return_state=False)

tab.get_tab_out(params, box_style='info', column_cnt=5)

VBox(box_style='info', children=(HBox(children=(HBox(children=(Button(description='x_train', layout=Layout(hei…

In [8]:
# Upper clipping threshold tried for `feas`; the plotting cell that follows reads the
# resulting column as f'{feas}_clipU{index}'.
index = 4
cfg = pd.DataFrame({
    'feas': feas,
    'keep_value': [np.nan],
    'flag': [[(index, 'clip_upper')]],
})
display.display(cfg)

# Apply the flag config to two-column slices so the real frames are left untouched.
# The config dict is rebuilt for every call.
x_train_tmp, x_test_tmp, oot_tmp = (
    fp.add_flag(frame[[feas, target]], fea_cfg=cfg.set_index('feas').to_dict('index'))
    for frame in (x_train, x_test, oot)
)

Unnamed: 0,feas,keep_value,flag
0,ovd_gt1_repay_cnt_rf_due_p7d,,"[(4, clip_upper)]"


100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 111.41it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 199.15it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 202.12it/s]


In [9]:
# Bivariate plot of the clipped column on the temporary frames, one entry per split.
tab = fplt.TabOut()
params = {}
fea = f'{feas}_clipU{index}'
# Explicit name -> frame mapping instead of eval() on variable names.
splits = {'x_train_tmp': x_train_tmp, 'x_test_tmp': x_test_tmp, 'oot_tmp': oot_tmp}
for split_name, split_df in splits.items():
    params[split_name] = fplt.PlotUtils().plot_bivar(
        split_df, feas=fea, target=target,
        cut_params=None, title=fea,
        yaxis='count_rto', pyecharts=False, mark_line=True,
        draw_lin=False, save_path=None, thred=-1, max_bins=9999,
        color_bar=['steelblue'], color_line=['red', 'black'], return_state=False)

tab.get_tab_out(params, box_style='info', column_cnt=5)

VBox(box_style='info', children=(HBox(children=(HBox(children=(Button(description='x_train_tmp', layout=Layout…

#### woe

In [10]:
# x_train.groupby(['级别'],dropna=False).agg({'OBJECTNO':len, target: 'mean'})

In [11]:
# Feature used for the single-feature WOE binning experiment in the following cells.
feas = 'repay_amt_in_table_rf_repay_sum_p186d'

In [12]:
# Bivariate plot of the raw feature (cut_params=10) against the target, one entry per split.
tab = fplt.TabOut()
# Explicit name -> frame mapping instead of eval() on variable names.
splits = {'x_train': x_train, 'x_test': x_test, 'oot': oot}
params = {}
for split_name, split_df in splits.items():
    params[split_name] = fplt.PlotUtils().plot_bivar(
        split_df, feas=feas, target=target,
        cut_params=10, title=feas,
        yaxis='count_rto', pyecharts=False, mark_line=True,
        draw_lin=False, save_path=None, thred=-1, max_bins=9999,
        color_bar=['steelblue'], color_line=['red', 'black'], return_state=False)

tab.get_tab_out(params, box_style='info', column_cnt=5)

VBox(box_style='info', children=(HBox(children=(HBox(children=(Button(description='x_train', layout=Layout(hei…

In [13]:
# One-row binning config for the feature under test.
cfg = pd.DataFrame({
    'feas': feas,
    'keep_value': [np.nan],
    'tree_params': [{'max_depth': 3, 'min_samples_leaf': 0.01, 'max_bins': 6}],
})
# Bins are derived from the training split only.
bins_cfg = fp.get_bins_bfwoe(
    x_train,
    fea_cfg=cfg.set_index('feas').to_dict("index"),
    target=target,
    save_path=None,
)
# The config returned by the train call ('calc') is threaded into the test/oot calls ('nocalc').
# NOTE(review): presumably 'calc' fits the WOE values and 'nocalc' only applies them — confirm in fp.woe_encode.
out, x_train_tmp = fp.woe_encode(x_train[[feas, target]], woe_cfg=bins_cfg, target=target, type_='calc')
out, x_test_tmp = fp.woe_encode(x_test[[feas, target]], woe_cfg=out, target=target, type_='nocalc')
out, oot_tmp = fp.woe_encode(oot[[feas, target]], woe_cfg=out, target=target, type_='nocalc')
display.display(out)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.68it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.50it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.16it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.45it/s]


{'repay_amt_in_table_rf_repay_sum_p186d': {'bins': [-inf,
   3404.1949462890625,
   5899.030029296875,
   9657.81005859375,
   21786.26953125,
   39505.5,
   inf],
  'keep_value': nan,
  'tree_params': {'max_depth': 3, 'min_samples_leaf': 0.01, 'max_bins': 6},
  'woe_cfg': {'(39505.5, inf]': -1.676204686094938,
   '(21786.27, 39505.5]': -0.9851229551816859,
   '(9657.81, 21786.27]': -0.5356402801744277,
   '(5899.03, 9657.81]': -0.20399402679931913,
   '(3404.195, 5899.03]': 0.09302843275086148,
   '(-inf, 3404.195]': 0.32509098150746796}}}

In [14]:
# Bivariate plot of the WOE-encoded column (f'{feas}_afwoe') on the temporary frames.
tab = fplt.TabOut()
# Explicit name -> frame mapping instead of eval() on variable names.
splits = {'x_train_tmp': x_train_tmp, 'x_test_tmp': x_test_tmp, 'oot_tmp': oot_tmp}
params = {}
for split_name, split_df in splits.items():
    params[split_name] = fplt.PlotUtils().plot_bivar(
        split_df, feas=f'{feas}_afwoe', target=target,
        cut_params=None, title=f'{feas}_afwoe',
        yaxis='count_rto', pyecharts=False, mark_line=True,
        draw_lin=False, save_path=None, thred=-1, max_bins=9999,
        color_bar=['steelblue'], color_line=['red', 'black'], return_state=False)

tab.get_tab_out(params, box_style='info', column_cnt=5)

VBox(box_style='info', children=(HBox(children=(HBox(children=(Button(description='x_train_tmp', layout=Layout…

### WOE encoding (all configured features)

In [6]:
# Load the per-feature processing config and turn the `tree_params` text cells into Python objects.
# NOTE(review): eval() executes whatever expression is stored in the spreadsheet — only use a trusted
# dp_cfg.xlsx. ast.literal_eval would be safer if every cell is a plain literal (TODO confirm).
cfg = pd.read_excel(join(cfg_path,'cfg','dp_cfg.xlsx'))
cfg['tree_params'] = cfg['tree_params'].apply(lambda x: eval(x) if pd.notnull(x) else np.nan)
print(cfg.shape)
cfg.head(2)

(268, 6)


Unnamed: 0,feas,keep_value,tree_params,flag,encoding_type,comment
0,DENGJI,,,,{'TS':'TS'},
1,ely_gt0_repay_amt_rf_due_max_p7d,,"{'max_depth': 3, 'min_samples_leaf': 0.02, 'ma...",,,


In [7]:
# idx = pd.notnull(cfg['tree_params'])

# woe_cfg = fp.get_bins_bfwoe(x_train, fea_cfg=cfg[idx].set_index('feas').to_dict("index"), 
#                              target=target, save_path=join(cfg_path,'cfg'))

In [8]:
# Read the previously saved bin config instead of recomputing it.
# NOTE(review): the file is opened with the platform default encoding — confirm it matches how it was written.
tree_bins_path = join(cfg_path, 'cfg', 'tree_bins')
with open(tree_bins_path) as bins_file:
    woe_cfg = json.load(bins_file)

In [9]:
# WOE-encode every configured feature. The config returned by the train call is
# passed on to the test and out-of-time calls; each frame is rebound to its encoded version.
out, x_train = fp.woe_encode(
    x_train, woe_cfg=woe_cfg, target=target, type_='calc',
    save_path=join(cfg_path, 'cfg'),
)
out, x_test = fp.woe_encode(x_test, woe_cfg=out, target=target, type_='nocalc')
out, oot = fp.woe_encode(oot, woe_cfg=out, target=target, type_='nocalc')
# Sanity check: shapes and target rate of each split.
print(*[frame.shape for frame in (x_train, x_test, oot)])
print(*[frame[target].mean() for frame in (x_train, x_test, oot)])
x_train.head(2)

100%|████████████████████████████████████████████████████████████████████████████████| 223/223 [10:11<00:00,  2.74s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 223/223 [02:39<00:00,  1.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 223/223 [02:41<00:00,  1.38it/s]


(70224, 735) (30096, 735) (34251, 735)
0.059139325586694005 0.05911084529505582 0.053166330910046425


Unnamed: 0,ely_gt1_repay_amt_rf_repay_max_p7d,ovd_days_rf_due_max_freq_p7d,normal_days_rf_repay_median_p7d,repay_amt_in_table_rf_repay_max_p96d,normal_repay_amt_rf_repay_sum_p96d,ovd_days_rf_due_last_p36d,normal_repay_rto_p8repayment,LK_CUSTOMERLEVEL,normal_days_rf_repay_max_p36d,JY_CREDITMODE,...,ovd_repay_rto_p7repayment_afwoe,ovd_repay_rto_p8repayment_afwoe,ovd_repay_rto_p10repayment_afwoe,ovd_repay_rto_p15repayment_afwoe,predictscore_afwoe,repay_amt_in_table_rf_due_max_p66d_afwoe,repay_amt_in_table_rf_due_max_p96d_afwoe,repay_amt_in_table_rf_due_sum_p186d_afwoe,repay_amt_in_table_rf_repay_max_p96d_afwoe,repay_amt_in_table_rf_repay_sum_p186d_afwoe
0,,0.0,,3261.81,7905.13,0.0,1.0,D1,5.0,,...,-0.128414,-0.131045,-0.130742,-0.117647,-0.115052,-0.394848,-0.287955,-0.269403,-0.352477,-0.203994
1,0.0,0.0,0.0,3006.23,3006.23,0.0,1.0,D1,0.0,,...,-0.128414,-0.131045,-0.130742,-0.117647,-0.115052,-0.394848,-0.287955,0.225623,-0.352477,0.266929


In [11]:
# with open(join(cfg_path,'cfg','nwoe_cfg'), 'w') as json_file:
#     json.dump(out, json_file, ensure_ascii=False)

In [22]:
# Spot-check one WOE-encoded column across the three splits.
feas = 'repay_amt_in_table_rf_due_sum_p186d_afwoe'

tab = fplt.TabOut()
# Explicit name -> frame mapping instead of eval() on variable names.
splits = {'x_train': x_train, 'x_test': x_test, 'oot': oot}
params = {}
for split_name, split_df in splits.items():
    params[split_name] = fplt.PlotUtils().plot_bivar(
        split_df, feas=feas, target=target,
        cut_params=None, title=feas,
        yaxis='count_rto', pyecharts=False, mark_line=True,
        draw_lin=False, save_path=None, thred=-1, max_bins=9999,
        color_bar=['steelblue'], color_line=['red', 'black'], return_state=False)

tab.get_tab_out(params, box_style='info', column_cnt=5)

VBox(box_style='info', children=(HBox(children=(HBox(children=(Button(description='x_train', layout=Layout(hei…

### add flag

In [23]:
# Reload the processing config and turn the `flag` text cells into Python objects.
# NOTE(review): eval() executes whatever expression is stored in the spreadsheet — only use a trusted
# dp_cfg.xlsx. Cells presumably hold lists such as [(4, 'clip_upper')], as in the demo config — verify.
cfg = pd.read_excel(join(cfg_path,'cfg','dp_cfg.xlsx'))
cfg['flag'] = cfg['flag'].apply(lambda x: eval(x) if pd.notnull(x) else np.nan)
print(cfg.shape)
cfg.head(2)

(268, 6)


Unnamed: 0,feas,keep_value,tree_params,flag,encoding_type,comment
0,DENGJI,,,,{'TS':'TS'},
1,ely_gt0_repay_amt_rf_due_max_p7d,,"{'max_depth': 3, 'min_samples_leaf': 0.02, 'ma...",,,


In [24]:
# Rows of the config that actually define a flag.
idx = cfg['flag'].notnull()


def _flag_cfg():
    """Build a fresh {feature: settings} dict from the config rows that define a flag."""
    return cfg[idx].set_index('feas').to_dict('index')


# Each split is rebound to the frame returned by add_flag; a new config dict is built per call.
x_train = fp.add_flag(x_train, fea_cfg=_flag_cfg())
x_test = fp.add_flag(x_test, fea_cfg=_flag_cfg())
oot = fp.add_flag(oot, fea_cfg=_flag_cfg())
# Sanity check: shapes and target rate of each split.
print(*[frame.shape for frame in (x_train, x_test, oot)])
print(*[frame[target].mean() for frame in (x_train, x_test, oot)])
x_train.head(2)

100%|██████████████████████████████████████████████████████████████████████████████████| 37/37 [00:22<00:00,  1.61it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 37/37 [00:09<00:00,  3.76it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 37/37 [00:11<00:00,  3.34it/s]

(70224, 772) (30096, 772) (34251, 772)
0.059139325586694005 0.05911084529505582 0.053166330910046425





Unnamed: 0,ely_gt1_repay_amt_rf_repay_max_p7d,ovd_days_rf_due_max_freq_p7d,normal_days_rf_repay_median_p7d,repay_amt_in_table_rf_repay_max_p96d,normal_repay_amt_rf_repay_sum_p96d,ovd_days_rf_due_last_p36d,normal_repay_rto_p8repayment,LK_CUSTOMERLEVEL,normal_days_rf_repay_max_p36d,JY_CREDITMODE,...,ovd_days_rf_repay_max_p36d_clipU2,ovd_days_rf_repay_max_p66d_clipU3,ovd_days_rf_repay_max_p96d_clipU3,ovd_days_rf_repay_min_p7d_clipU4,ovd_days_rf_repay_min_p36d_clipU2,ovd_days_rf_repay_min_p66d_clipU2,ovd_days_rf_repay_sum_p7d_clipU2,ovd_gt0_repay_cnt_rf_due_p7d_clipU1,ovd_gt0_repay_cnt_rf_due_p186d_clipU2,ovd_gt0_repay_cnt_rf_repay_p186d_clipU2
0,,0.0,,3261.81,7905.13,0.0,1.0,D1,5.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,3006.23,3006.23,0.0,1.0,D1,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Spot-check one clipped column across the three splits.
feas = 'ovd_days_rf_repay_max_p66d_clipU3'

tab = fplt.TabOut()
# Explicit name -> frame mapping instead of eval() on variable names.
splits = {'x_train': x_train, 'x_test': x_test, 'oot': oot}
params = {}
for split_name, split_df in splits.items():
    params[split_name] = fplt.PlotUtils().plot_bivar(
        split_df, feas=feas, target=target,
        cut_params=None, title=feas,
        yaxis='count_rto', pyecharts=False, mark_line=True,
        draw_lin=False, save_path=None, thred=-1, max_bins=9999,
        color_bar=['steelblue'], color_line=['red', 'black'], return_state=False)

tab.get_tab_out(params, box_style='info', column_cnt=5)

VBox(box_style='info', children=(HBox(children=(HBox(children=(Button(description='x_train', layout=Layout(hei…

### str2number

In [28]:
# Reload the processing config and turn the `encoding_type` text cells (e.g. {'TS':'TS'}) into Python objects.
# NOTE(review): eval() executes whatever expression is stored in the spreadsheet — only use a trusted
# dp_cfg.xlsx. ast.literal_eval would be safer if every cell is a plain literal (TODO confirm).
cfg = pd.read_excel(join(cfg_path,'cfg','dp_cfg.xlsx'))
cfg['encoding_type'] = cfg['encoding_type'].apply(lambda x: eval(x) if pd.notnull(x) else np.nan)
print(cfg.shape)
cfg.head(2)

(268, 6)


Unnamed: 0,feas,keep_value,tree_params,flag,encoding_type,comment
0,DENGJI,,,,{'TS': 'TS'},
1,ely_gt0_repay_amt_rf_due_max_p7d,,"{'max_depth': 3, 'min_samples_leaf': 0.02, 'ma...",,,


In [29]:
# Fit the categorical-to-numeric encoders on the training split only, for the
# features whose config row specifies an encoding type.
idx = cfg['encoding_type'].notnull()
str2numbercfg = fp.Str2Number().str2number_fit(
    x_train,
    fea_cfg=cfg.loc[idx].set_index('feas').to_dict("index"),
    target=target,
    save_path=join(cfg_path, 'cfg'),
)

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:36<00:00,  6.03s/it]


In [30]:
# Alternative: restore a previously fitted encoder config from disk instead of refitting.
# tmp = pd.read_pickle(join(cfg_path,'cfg','str2numbercfg.pkl'))
# str2numbercfg = dict(zip(list(tmp['feas']),list(tmp['str2numbercfg'])))


def _to_number(frame):
    """Apply the fitted encoder config to one frame using a fresh Str2Number instance."""
    return fp.Str2Number().str2number_transform(frame, str2numbercfg)


x_train = _to_number(x_train)
x_test = _to_number(x_test)
oot = _to_number(oot)
# Sanity check: shapes and target rate of each split.
print(*[frame.shape for frame in (x_train, x_test, oot)])
print(*[frame[target].mean() for frame in (x_train, x_test, oot)])
x_train.head(2)

(70224, 800) (30096, 800) (34251, 800)
0.059139325586694005 0.05911084529505582 0.053166330910046425


Unnamed: 0,ely_gt1_repay_amt_rf_repay_max_p7d,ovd_days_rf_due_max_freq_p7d,normal_days_rf_repay_median_p7d,repay_amt_in_table_rf_repay_max_p96d,normal_repay_amt_rf_repay_sum_p96d,ovd_days_rf_due_last_p36d,normal_repay_rto_p8repayment,LK_CUSTOMERLEVEL,normal_days_rf_repay_max_p36d,JY_CREDITMODE,...,province_LeaveOneOutEncoder,province_CatBoostEncoder,级别_TS,级别_TargetEncoder,级别_MEstimateEncoder,级别_JamesSteinEncoder,级别_GLMMEncoder,级别_WOEEncoder,级别_LeaveOneOutEncoder,级别_CatBoostEncoder
0,,0.0,,3261.81,7905.13,0.0,1.0,D1,5.0,,...,0.054348,0.05435,0.055881,0.055881,0.055881,0.056938,-0.053236,-0.058794,0.055881,0.055881
1,0.0,0.0,0.0,3006.23,3006.23,0.0,1.0,D1,0.0,,...,0.054348,0.05435,0.055881,0.055881,0.055881,0.056938,-0.053236,-0.058794,0.055881,0.055881


In [44]:
# Encoded categorical columns are recognised by their suffix.
cat_feas = [c for c in x_train if re.search('_TS$|_selfcut$|Encoder$', c)]
# For every split, report the worst per-column NaN count and any encoded column that is
# still of object dtype. Frames are paired with their names explicitly (no eval()).
for split_name, split_df in (('x_train', x_train), ('x_test', x_test), ('oot', oot)):
    encoded = split_df[cat_feas]
    nan_sum_max = encoded.isnull().sum().max()
    object_cols = list(encoded.select_dtypes("object"))
    print(f'{split_name} is nan sum max is {nan_sum_max}, cat feas are {object_cols}')

x_train is nan sum max is 0, cat feas are []
x_test is nan sum max is 0, cat feas are []
oot is nan sum max is 0, cat feas are ['LK_CUSTOMERLEVEL_TS']


In [46]:
# The OOT split still holds the raw level 'A' in this encoded column (see the dtype check above);
# map it to 0 and convert explicitly, so the column is numeric without relying on
# Series.replace silently downcasting an object column.
# NOTE(review): 0 is a hand-picked fallback for the unseen level — confirm it is the intended value.
oot['LK_CUSTOMERLEVEL_TS'] = pd.to_numeric(oot['LK_CUSTOMERLEVEL_TS'].replace('A', 0))
oot['LK_CUSTOMERLEVEL_TS'].unique()

array([0.05203046, 0.0667806 , 0.08597884, 0.06182491, 0.04183188,
       0.03076923, 0.02898551, 0.03333333, 0.05202312, 0.0625    ,
       0.        ])

In [47]:
# One independent config dict per encoded column. (Multiplying a one-element list would make
# every feature share the same dict and the same nested `tree_params` object.)
fea_cfg = {
    fea_name: {'keep_value': np.nan,
               'tree_params': {'max_depth': 2, 'min_samples_leaf': 0.05, 'thred': 2}}
    for fea_name in cat_feas
}
# Bins are derived from the training split only.
bins_cfg = fp.get_bins_bfwoe(x_train, fea_cfg=fea_cfg, target=target, save_path=None)

# The config returned by the train call is passed on to the test and out-of-time calls.
out, x_train = fp.woe_encode(x_train, woe_cfg=bins_cfg, target=target, type_='calc')
out, x_test = fp.woe_encode(x_test, woe_cfg=out, target=target, type_='nocalc')
out, oot = fp.woe_encode(oot, woe_cfg=out, target=target, type_='nocalc')

print(x_train.shape, x_test.shape, oot.shape)
print(x_train[target].mean(), x_test[target].mean(), oot[target].mean())
x_train.head(2)

100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:01<00:00, 19.00it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:43<00:00,  1.55s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:18<00:00,  1.53it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:21<00:00,  1.32it/s]


(70224, 828) (30096, 828) (34251, 828)
0.059139325586694005 0.05911084529505582 0.053166330910046425


Unnamed: 0,ely_gt1_repay_amt_rf_repay_max_p7d,ovd_days_rf_due_max_freq_p7d,normal_days_rf_repay_median_p7d,repay_amt_in_table_rf_repay_max_p96d,normal_repay_amt_rf_repay_sum_p96d,ovd_days_rf_due_last_p36d,normal_repay_rto_p8repayment,LK_CUSTOMERLEVEL,normal_days_rf_repay_max_p36d,JY_CREDITMODE,...,province_LeaveOneOutEncoder_afwoe,province_CatBoostEncoder_afwoe,级别_TS_afwoe,级别_TargetEncoder_afwoe,级别_MEstimateEncoder_afwoe,级别_JamesSteinEncoder_afwoe,级别_GLMMEncoder_afwoe,级别_WOEEncoder_afwoe,级别_LeaveOneOutEncoder_afwoe,级别_CatBoostEncoder_afwoe
0,,0.0,,3261.81,7905.13,0.0,1.0,D1,5.0,,...,-0.13433,-0.13433,-0.053935,-0.053935,-0.053935,-0.053935,-0.053935,-0.053935,-0.053935,-0.053935
1,0.0,0.0,0.0,3006.23,3006.23,0.0,1.0,D1,0.0,,...,-0.13433,-0.13433,-0.053935,-0.053935,-0.053935,-0.053935,-0.053935,-0.053935,-0.053935,-0.053935


In [52]:
# Spot-check one WOE-encoded categorical column across the three splits.
feas = 'province_TS_afwoe'

tab = fplt.TabOut()
# Explicit name -> frame mapping instead of eval() on variable names.
splits = {'x_train': x_train, 'x_test': x_test, 'oot': oot}
params = {}
for split_name, split_df in splits.items():
    params[split_name] = fplt.PlotUtils().plot_bivar(
        split_df, feas=feas, target=target,
        cut_params=None, title=feas,
        yaxis='count_rto', pyecharts=False, mark_line=True,
        draw_lin=False, save_path=None, thred=-1, max_bins=9999,
        color_bar=['steelblue'], color_line=['red', 'black'], return_state=False)

tab.get_tab_out(params, box_style='info', column_cnt=5)

VBox(box_style='info', children=(HBox(children=(HBox(children=(Button(description='x_train', layout=Layout(hei…

In [57]:
# For every encoded column, build {raw category value: WOE value} from the training split:
# group by the raw column (the encoded name without its last '_suffix'), take the max of the
# '<encoded>_afwoe' column per group, and order the categories by that value.
cfg = {}
for enc_fea in fea_cfg:
    raw_fea = '_'.join(enc_fea.split('_')[:-1])
    woe_col = f'{enc_fea}_afwoe'
    cfg.update(
        x_train.groupby(raw_fea, dropna=False)[woe_col]
        .max()
        .sort_values()
        .to_frame(woe_col)
        .to_dict('dict')
    )

cfg

{'DENGJI_TS_afwoe': {'V1': -0.6067725012176793,
  'V2': -0.23587284529649258,
  'V3': 0.009618711245058321,
  'V4': 0.21694175948107897},
 'LK_ADVICE_RATE_TS_afwoe': {'{12=0.00045, 3=0.00045, 6=0.00045, 9=0.00045}': -0.23276645855980185,
  '{12=0.0005, 3=0.0005, 6=0.0005, 9=0.0005}': -0.23276645855980185,
  '{12=0.0006, 3=0.0006, 6=0.0006, 9=0.0006}': -0.02398030297733064,
  '{12=0.00066, 3=0.00066, 6=0.00066, 9=0.00066}': 0.2921304270079355},
 'LK_CREDITRATING_TS_afwoe': {'优质': -0.2100920153620232,
  '次优': -0.2100920153620232,
  '良好': -0.2100920153620232,
  'nan': -0.019234668198445302,
  '稳定': -0.019234668198445302,
  '中等': 0.2921304270079355},
 'LK_CREDITRATING_selfcut_afwoe': {'nan': -0.03693060879826521,
  '优质': -0.03693060879826521,
  '次优': -0.03693060879826521,
  '稳定': -0.03693060879826521,
  '良好': -0.03693060879826521,
  '中等': 0.2921304270079355},
 'LK_CUSTOMERLEVEL_TS_afwoe': {'B': -0.4236921542885925,
  'C1': -0.4236921542885925,
  'C2': -0.4236921542885925,
  'E': -0.4236921

In [58]:
# Persist the category -> WOE mapping. ensure_ascii=False writes non-ASCII keys verbatim, so the
# file encoding is pinned to UTF-8 instead of depending on the platform locale.
# Readers of this file must open it with encoding='utf-8' as well.
with open(join(cfg_path, 'cfg', 'cwoe_cfg'), 'w', encoding='utf-8') as json_file:
    json.dump(cfg, json_file, ensure_ascii=False)

In [59]:
# Persist the fully processed splits next to the input data.
for processed, fname in (
    (x_train, 'x_train_afproc.pkl'),
    (x_test, 'x_test_afproc.pkl'),
    (oot, 'oot_afproc.pkl'),
):
    processed.to_pickle(join(dpath, fname))