In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell execution, so edits to helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys
import copy
from typing import (
    Dict, 
    List,
    Any
)

from datetime import date
from datetime import datetime


import numpy as np
import pandas as pd

from scipy.stats import uniform
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit

from joblib import load, dump

import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Put the parent of the current working directory on the import path;
# presumably this is the project root that contains the `utils` package.
sys.path.append(os.path.join(os.getcwd(), '..'))
from utils.metrics import Metrics
from utils.modelling_tools import (
    search_hyper_params_and_log, 
    update_solver_by_penalty, 
    evaluate_model
)
from utils.metrics_by_quantile import MetricsByQuantile

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [5]:
# Directory layout, resolved relative to the current working directory.
PROJECT_PATH: str = os.path.join(os.getcwd(), '..')
DATA_PATH: str = os.path.join(PROJECT_PATH, 'data')
# Sub-folders of <project>/objects.
OPT_BIN_PATH: str = os.path.join(PROJECT_PATH, 'objects', 'optbinners')
MODELS_PATH: str = os.path.join(PROJECT_PATH, 'objects', 'models')

In [6]:
# Date windows of the data splits. The split cells treat every window as
# half-open: FROM <= _RDATE < TO.
# The constants are plain `datetime.date` objects. They are annotated with the
# `date` class because `datetime` in this notebook is the class imported via
# `from datetime import datetime`, so `datetime.date` would name a method.
# NOTE(review): TRAIN_TO (2022-01-01) is a year before VAL_FROM (2023-01-01),
# while the train+val split spans TRAIN_FROM..VAL_TO - confirm this is intended.
TRAIN_FROM: date = date.fromisoformat('2021-12-01')
TRAIN_TO: date = date.fromisoformat('2022-01-01')

VAL_FROM: date = date.fromisoformat('2023-01-01')
VAL_TO: date = date.fromisoformat('2023-05-01')

TEST_FROM: date = date.fromisoformat('2023-05-01')
TEST_TO: date = date.fromisoformat('2023-08-01')

In [7]:
# Model input features (the "...OptBin" columns), in the order used when the
# coefficient table is built from COLS_TO_USE. Commented-out names are not
# part of the feature list.
COLS_TO_USE: List[str] = [
    "OPTIN_11SCN_MLOptBin",
    "AFF_B_09_01OptBin",
    "TRD_A_20OptBin",
    "VM01_SP_VM2_15OptBin",
    "SP_G_37OptBin",
    "E4_Q_17OptBin",
    "TRD_C_07OptBin",
    "TRD_B_20OptBin",
    "SP_B2_18OptBin",
    "SPA_F2_33OptBin",
    "VM01_SP_VM2_14OptBin",
    "VM01_SP_VM2_24OptBin",
    "SP_G_38OptBin",
    "SP_B1_14OptBin",
    "GEN11_SP_N_91OptBin",
    "GEN11_SP_EDI_07OptBin",
    "ND_ECC_06OptBin",
    "TRD_B_18OptBin",
    "TRD_B_08OptBin",
    "EA5_S_01OptBin",
    "CLU_CLI_L6M_NPR_L6MOptBin",
    "SP_I_63_01OptBin",
    # "VM08_SP_VM2_20OptBin",
    # "EA1_D_02OptBin",
    "TRD_O_06OptBin",
    # "AFF_T_07_01OptBin",
    # "VM02_SP_VM1_18OptBin",
    # "GEN11_SP_K_80_TOOptBin",
    "GEN11_SP_N_92_ABOtherOptBin",
]

In [19]:
# Target configuration.
TARGET_NAME: str = 'GB6_Flag_2Limit'   # raw target column (labels 'G', 'B', 'I')
TARGET_VALS: List[int] = [0, 1]
TARGET_BIN: str = f'_{TARGET_NAME}_bin'
TARGET_BIN_0isB: str = f'_{TARGET_NAME}_bin_0isB'

# Plot colours: outer key is a column name, inner mapping is value -> colour.
# The inner keys are strings for the raw target / 'DataPart' and ints for the
# binary target, hence Dict[Any, str].
PALETTE: Dict[str, Dict[Any, str]] = {
    TARGET_NAME: {
        'G': 'green',
        'B': 'red',
        'I': 'black'
    },
    TARGET_BIN: {0: 'green', 1: 'red'},
    'DataPart': {'Train+Val': 'black', 'Val': 'blue', 'Test': 'red', 'Train': 'orange'}
}
RANDOM_SEED: int = 42

In [51]:
# Half-open date window (FROM <= _RDATE < TO) of the sample used for the
# per-bin statistics. Annotated with the `date` class: `datetime` in this
# notebook is the class, so `datetime.date` would name a method, not a type.
TRAIN_BIN_FROM: date = date.fromisoformat('2021-07-01')
TRAIN_BIN_TO: date = date.fromisoformat('2021-12-01')

In [9]:
# Load the modelling dataset from the project's data folder.
df: pd.DataFrame = pd.read_csv(
    os.path.join(DATA_PATH, 'final_aiq2_s_neg_sep_I_excl.csv'),
    low_memory=False,
)

In [10]:
# Report the size of the loaded frame.
n_rows, n_cols = df.shape
print('Number of rows: %.0f; columns: %.0f' % (n_rows, n_cols))

Number of rows: 110462; columns: 1866


# Preprocessing

Convert the date columns (`_RDATE_EOM`, `_RDATE`) to `datetime.date` objects.

In [11]:
# Parse both date columns and keep only the calendar-date part, so they can be
# compared against the `date` window constants.
for date_col in ('_RDATE_EOM', '_RDATE'):
    parsed = pd.to_datetime(df[date_col])
    df[date_col] = parsed.dt.date

In [12]:
# Show the earliest and latest _RDATE present in the data.
print('Available date range:')
df['_RDATE'].agg(['min', 'max'])

Available date range:


min    2021-06-30
max    2023-11-30
Name: _RDATE, dtype: object

Generate the binary target `_GB6_Flag_2Limit_bin_0isB`: rows where `GB6_Flag_2Limit` is `'I'` or `'B'` get `1`, and rows where it is `'G'` get `0`.

In [13]:
# Binary target: 'G' -> 0, 'I' and 'B' -> 1.
# `Series.map` is used instead of `Series.replace`: replacing strings with ints
# relies on silent downcasting of the object column, which pandas has
# deprecated (FutureWarning). `map` yields an integer column directly.
target_to_bin = {'G': 0, 'I': 1, 'B': 1}

# `map` turns unmapped labels into NaN, so fail loudly if any other label
# (or a missing value) is present instead of silently corrupting the target.
assert df[TARGET_NAME].isin(list(target_to_bin)).all(), (
    f'Unexpected labels in {TARGET_NAME}; expected only {sorted(target_to_bin)}'
)

df[f'_{TARGET_NAME}_bin_0isB'] = df[TARGET_NAME].map(target_to_bin)

Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


In [14]:
# Class balance of the binary target on the full data: absolute counts (n)
# and shares in percent (%), ordered by target value.
target_bin_col = f'_{TARGET_NAME}_bin_0isB'
target_bin_counts = df[target_bin_col].value_counts().sort_index()
pd.DataFrame(
    {
        'n': target_bin_counts,
        '%': target_bin_counts / target_bin_counts.sum() * 100,
    }
).rename_axis(target_bin_col).reset_index()

Unnamed: 0,_GB6_Flag_2Limit_bin_0isB,n,%
0,0,99444,90.025529
1,1,11018,9.974471


# Split data

In [15]:
def _rows_in_window(frame: pd.DataFrame, date_from, date_to, exclude_i: bool = True) -> pd.DataFrame:
    """Return the rows of `frame` with date_from <= _RDATE < date_to.

    When `exclude_i` is True, rows whose TARGET_NAME label is 'I' are dropped
    as well. The result is re-indexed from 0.
    """
    keep = (frame._RDATE >= date_from) & (frame._RDATE < date_to)
    if exclude_i:
        keep = keep & (frame[TARGET_NAME] != 'I')
    return frame.loc[keep, :].reset_index(drop=True)


# Every split except `df_test` drops the 'I' label.
df_train = _rows_in_window(df, TRAIN_FROM, TRAIN_TO)
df_train_val = _rows_in_window(df, TRAIN_FROM, VAL_TO)
df_val = _rows_in_window(df, VAL_FROM, VAL_TO)
df_test_same = _rows_in_window(df, TEST_FROM, TEST_TO)

# Full test window, 'I' rows included.
df_test = _rows_in_window(df, TEST_FROM, TEST_TO, exclude_i=False)

In [16]:
# Class balance of the binary target on the test window: counts (n) and
# shares in percent (%), ordered by target value.
print(f'Distribution of _{TARGET_NAME}_bin_0isB on test')
test_target_col = f'_{TARGET_NAME}_bin_0isB'
test_target_counts = df_test[test_target_col].value_counts().sort_index()
pd.DataFrame(
    {
        'n': test_target_counts,
        '%': test_target_counts / test_target_counts.sum() * 100,
    }
).rename_axis(test_target_col).reset_index()

Distribution of _GB6_Flag_2Limit_bin_0isB on test


Unnamed: 0,_GB6_Flag_2Limit_bin_0isB,n,%
0,0,13663,84.126593
1,1,2578,15.873407


In [17]:
# Registry of the data splits, keyed by split name.
data_dict = dict(
    df_train=df_train,
    df_val=df_val,
    df_train_val=df_train_val,
    df_test_same=df_test_same,
    df_test=df_test,
)

In [20]:
# Draw a stratified random subsample of N rows from the 'I'-excluded test
# split; stratification is on the TARGET_BIN column and the draw is seeded.
N = 10000
sss = StratifiedShuffleSplit(n_splits=1, train_size=N, random_state=RANDOM_SEED)
test_same_frame = data_dict['df_test_same'].reset_index(drop=True)
indx = sss.split(test_same_frame, data_dict['df_test_same'][TARGET_BIN])
# With n_splits=1 the generator yields one (train, test) pair; the "train"
# part is the N-row sample.
random_indx = next(indx)[0]

assert len(random_indx) == N, "Incorrect number of elements in the subsample"
test_sample = test_same_frame.iloc[random_indx, :].reset_index(drop=True)

In [21]:
# Register the stratified test subsample alongside the other splits.
data_dict.update(df_test_same_sample=test_sample)

# Run the experiment

In [24]:
# Load the persisted model from the models folder.
# NOTE: joblib.load unpickles arbitrary objects - only load trusted files.
model_file_path = os.path.join(
    MODELS_PATH,
    '[Model][Modelling] [Final_AIQ2-50] [I excluded] LogRegression - woe-2 - feature selection - 3 (final).jblb',
)
with open(model_file_path, 'rb') as model_file:
    model_obj = load(model_file)

In [25]:
# Load the persisted standard scaler that belongs to the model.
# NOTE: joblib.load unpickles arbitrary objects - only load trusted files.
scaler_file_path = os.path.join(
    MODELS_PATH,
    '[StdScaler][Modelling] [Final_AIQ2-50] [I excluded] LogRegression - woe-2 - feature selection - 3 (final).jblb',
)
with open(scaler_file_path, 'rb') as scaler_file:
    std_scaler = load(scaler_file)

In [31]:
# Model coefficients followed by the intercept, as one flat array.
np.concatenate((model_obj.coef_[0], model_obj.intercept_))

array([-1.24901939, -0.05524496, -0.0623808 , -0.05720196, -0.09008615,
        0.072927  , -0.14624046, -0.05058227, -0.0895822 , -0.20493718,
        0.16808272,  0.05093944, -0.06106494, -0.04644882, -0.08179527,
        0.10447431,  0.17728551,  0.06473774,  0.06599281, -0.13022312,
       -0.10635291, -0.17348975,  0.10107063, -0.08318623, -0.88757256])

In [35]:
# Coefficient table: one row per feature in COLS_TO_USE order, plus a final
# 'Intercept' row.
coeff_labels = [*COLS_TO_USE, 'Intercept']
coeff_values = np.concatenate((model_obj.coef_[0], model_obj.intercept_))
df_coeff = pd.DataFrame({'Feature': coeff_labels, 'Coeff': coeff_values})

In [36]:
# Display the coefficient table.
df_coeff

Unnamed: 0,Feature,Coeff
0,OPTIN_11SCN_MLOptBin,-1.249019
1,AFF_B_09_01OptBin,-0.055245
2,TRD_A_20OptBin,-0.062381
3,VM01_SP_VM2_15OptBin,-0.057202
4,SP_G_37OptBin,-0.090086
5,E4_Q_17OptBin,0.072927
6,TRD_C_07OptBin,-0.14624
7,TRD_B_20OptBin,-0.050582
8,SP_B2_18OptBin,-0.089582
9,SPA_F2_33OptBin,-0.204937


In [37]:
# Per-feature parameters stored in the fitted scaler: `mean_` and `scale_`
# (shown as 'std').
scaler_params = {
    'Feature': std_scaler.feature_names_in_,
    'mean': std_scaler.mean_,
    'std': std_scaler.scale_,
}
pd.DataFrame(scaler_params)

Unnamed: 0,Feature,mean,std
0,OPTIN_11SCN_MLOptBin,0.94843,1.539048
1,AFF_B_09_01OptBin,0.450239,1.162333
2,TRD_A_20OptBin,0.295816,0.949009
3,VM01_SP_VM2_15OptBin,0.529894,1.207681
4,SP_G_37OptBin,0.330991,0.907506
5,E4_Q_17OptBin,0.346699,0.764723
6,TRD_C_07OptBin,0.230825,0.81079
7,TRD_B_20OptBin,0.3221,0.77966
8,SP_B2_18OptBin,0.243233,0.726863
9,SPA_F2_33OptBin,0.231387,0.857163


In [38]:
# Side-by-side view of TRD_A_20: raw value, OptBin-encoded value, bin label
# and the standardised encoded value produced by the scaler.
std_column_names = [f'{name}Std' for name in COLS_TO_USE]
df_std = pd.DataFrame(std_scaler.transform(df[COLS_TO_USE]), columns=std_column_names)
pd.concat(
    [
        df[['TRD_A_20', 'TRD_A_20OptBin', 'BinTRD_A_20']].reset_index(drop=True),
        df_std[['TRD_A_20OptBinStd']],
    ],
    axis=1,
)

Unnamed: 0,TRD_A_20,TRD_A_20OptBin,BinTRD_A_20,TRD_A_20OptBinStd
0,21,0.000000,"[1, inf)",-0.311710
1,-2,-0.515010,-2,-0.854392
2,-2,-0.515010,-2,-0.854392
3,-3,1.436451,"{-3, 0}",1.201922
4,-2,-0.515010,-2,-0.854392
...,...,...,...,...
110457,-2,-0.515010,-2,-0.854392
110458,-2,-0.515010,-2,-0.854392
110459,-2,-0.515010,-2,-0.854392
110460,-3,1.436451,"{-3, 0}",1.201922


In [39]:
# Peek at the first rows of the full frame.
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,UNIQUE_ID,E1_A_01,E1_A_02,E1_A_03,E1_A_04,E1_A_05,E1_A_06,E1_A_07,...,BinE4_Q_17,GEN11_SP_N_92_ABOther,GEN11_SP_N_92_ABOtherOptBin,BinGEN11_SP_N_92_ABOther,GEN11_SP_K_80_JSOther,GEN11_SP_K_80_TO,GEN11_SP_K_80_TOOptBin,BinGEN11_SP_K_80_TO,GEN11_SP_K_80_PR,_GB6_Flag_2Limit_bin_0isB
0,0,0,44,0,0,0,0,0,0,3,...,2,B,0.883575,{'B'},Sole,G,-0.213474,"[J, I, S, U, G, F]",G,0
1,1,1,297,0,0,0,0,0,0,9,...,2,B,0.883575,{'B'},(Joint)OR(Sole&Joint),1234,0.788473,"{1, 2, 3, 4}",135,0
2,2,2,759,0,0,0,0,0,0,5,...,{6},Other,-0.59221,{'Other'},Sole,F,-0.213474,"[J, I, S, U, G, F]",F,0
3,3,3,760,0,0,0,3,19,20,6,...,"{3, 4, 7, 8}",B,0.883575,{'B'},Sole,G,-0.213474,"[J, I, S, U, G, F]",G,0
4,4,4,816,3,29,21,8,22,65,6,...,"{3, 4, 7, 8}",A,-0.956089,{'A'},(Joint)OR(Sole&Joint),1234,0.788473,"{1, 2, 3, 4}",246,0


In [40]:
# Raw / encoded / binned / standardised TRD_A_20 for the first row only.
# The scaler works row by row, so only that row is transformed instead of
# standardising the whole frame and discarding all but the first row.
first_row = df.head(1)
pd.concat(
    (
        first_row[['TRD_A_20', 'TRD_A_20OptBin', 'BinTRD_A_20']].reset_index(drop=True),
        pd.DataFrame(
            std_scaler.transform(first_row[COLS_TO_USE]),
            columns=[col+'Std' for col in COLS_TO_USE],
        )[['TRD_A_20OptBinStd']],
    ),
    axis=1,
)

Unnamed: 0,TRD_A_20,TRD_A_20OptBin,BinTRD_A_20,TRD_A_20OptBinStd
0,21,0.0,"[1, inf)",-0.31171


In [41]:
# Predicted probability of class 1 for the record with UNIQUE_ID 44.
model_obj.predict_proba(std_scaler.transform(df[df.UNIQUE_ID == 44][COLS_TO_USE]))[:, 1] 

array([0.25381008])

In [42]:
# Odds for the record with UNIQUE_ID 44: P(class 1) / P(class 0).
proba_id_44 = model_obj.predict_proba(
    std_scaler.transform(df[df.UNIQUE_ID == 44][COLS_TO_USE])
)
proba_id_44[:, 1] / proba_id_44[:, 0]

array([0.3401414])

In [43]:
# Index the coefficient table by feature name so coefficients can be looked up
# with `.loc`. Guarded so that re-running the cell is a no-op instead of
# raising KeyError once 'Feature' has already become the index.
if 'Feature' in df_coeff.columns:
    df_coeff = df_coeff.set_index('Feature')

In [44]:
# Position of TRD_A_20OptBin in the feature list (and so in the arrays below).
COLS_TO_USE.index('TRD_A_20OptBin')

2

In [45]:
# Standardised feature vector of the record with UNIQUE_ID 44.
std_scaler.transform(df[df.UNIQUE_ID == 44][COLS_TO_USE])

array([[ 0.30929647,  1.66774105, -0.31171036,  1.38723178,  0.06095715,
         0.72484521, -0.279504  ,  0.82179964, -0.57931539, -0.65032814,
         0.68335245, -0.22064468, -0.70136616, -0.3051029 ,  0.47633919,
         0.13872463,  0.1450299 , -1.21795392, -0.22807867, -0.57370458,
        -0.44442953,  0.37655354,  0.5858444 ,  0.81905616]])

In [46]:
# Feature coefficients in COLS_TO_USE order (the 'Intercept' row is left out).
df_coeff.loc[COLS_TO_USE, 'Coeff'].values

array([-1.24901939, -0.05524496, -0.0623808 , -0.05720196, -0.09008615,
        0.072927  , -0.14624046, -0.05058227, -0.0895822 , -0.20493718,
        0.16808272,  0.05093944, -0.06106494, -0.04644882, -0.08179527,
        0.10447431,  0.17728551,  0.06473774,  0.06599281, -0.13022312,
       -0.10635291, -0.17348975,  0.10107063, -0.08318623])

In [47]:
# Per-feature contribution to the linear predictor (log-odds) for UNIQUE_ID 44:
# coefficient x standardised feature value.
df_coeff.loc[COLS_TO_USE, 'Coeff'].values * std_scaler.transform(df[df.UNIQUE_ID == 44][COLS_TO_USE])

array([[-0.38631729, -0.09213429,  0.01944474, -0.07935238, -0.00549139,
         0.05286079,  0.04087479, -0.04156849,  0.05189635,  0.13327641,
         0.11485974, -0.01123952,  0.04282888,  0.01417167, -0.03896229,
         0.01449316,  0.0257117 , -0.07884759, -0.01505155,  0.0747096 ,
         0.04726637, -0.06532818,  0.05921167, -0.0681342 ]])

In [48]:
# Exponentiated contributions: the multiplicative effect of each feature on
# the odds for UNIQUE_ID 44.
np.exp(df_coeff.loc[COLS_TO_USE, 'Coeff'].values * std_scaler.transform(df[df.UNIQUE_ID == 44][COLS_TO_USE]))

array([[0.67955488, 0.91198267, 1.01963502, 0.92371437, 0.99452366,
        1.05428287, 1.04172167, 0.95928363, 1.05326656, 1.14256577,
        1.12171609, 0.98882341, 1.04375928, 1.01427256, 0.96178697,
        1.01459869, 1.0260451 , 0.92418077, 0.98506116, 1.07757118,
        1.04840124, 0.93675999, 1.06099979, 0.93413511]])

In [49]:
# exp(intercept): the odds when every standardised feature equals 0.
np.exp(model_obj.intercept_[0])

0.4116538067866693

In [50]:
# Rebuild the odds for UNIQUE_ID 44 by hand: exp(sum of contributions) x
# exp(intercept). This should equal the predict_proba ratio computed earlier.
np.exp(np.sum(df_coeff.loc[COLS_TO_USE, 'Coeff'].values * std_scaler.transform(df[df.UNIQUE_ID == 44][COLS_TO_USE]))) * np.exp(model_obj.intercept_[0])

0.3401414023159516

In [55]:
# Rows inside the binning-sample window (TRAIN_BIN_FROM <= _RDATE <
# TRAIN_BIN_TO), with the 'I' target label removed.
in_bin_window = (df._RDATE >= TRAIN_BIN_FROM) & (df._RDATE < TRAIN_BIN_TO)
df_train_bin = df.loc[in_bin_window & (df[TARGET_NAME] != 'I')].reset_index(drop=True)

In [56]:
# Train+val rows (TRAIN_FROM <= _RDATE < VAL_TO) without the 'I' label.
# This applies the same filter as the `df_train_val` built in the split section.
in_train_val_window = (df._RDATE >= TRAIN_FROM) & (df._RDATE < VAL_TO)
df_train_val = df.loc[in_train_val_window & (df[TARGET_NAME] != 'I')].reset_index(drop=True)

In [57]:
# Test rows (TEST_FROM <= _RDATE < TEST_TO) without the 'I' label.
# NOTE(review): this re-binds `df_test`, which the split section defined WITH
# 'I' rows; after this cell it has the same filter as `df_test_same`, while
# `data_dict['df_test']` still refers to the earlier frame.
in_test_window = (df._RDATE >= TEST_FROM) & (df._RDATE < TEST_TO)
df_test = df.loc[in_test_window & (df[TARGET_NAME] != 'I')].reset_index(drop=True)

## Sample for bins

In [93]:
def count_obs_per_bin(data: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """Count observations per bin for each requested feature.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding one ``Bin<feature>`` column per requested feature.
    columns : List[str]
        Encoded column names of the form ``<feature>OptBin``. The trailing
        ``OptBin`` is cut off to obtain the feature name; the bin labels are
        read from ``Bin<feature>``.

    Returns
    -------
    pd.DataFrame
        Long table with columns ``Bin`` (bin label), ``n`` (number of rows in
        that bin) and ``Feature``. Each feature's rows keep their own 0-based
        index. An empty frame is returned when ``columns`` is empty.

    Raises
    ------
    KeyError
        If a ``Bin<feature>`` column is missing from ``data``.
    """
    suffix = 'OptBin'
    per_feature = []
    for col in columns:
        feature = col[:-len(suffix)]
        # rename_axis pins the label column to 'Bin' regardless of how the
        # installed pandas version names the value_counts index.
        counts = (
            data[f'Bin{feature}']
            .value_counts()
            .rename_axis('Bin')
            .reset_index(name='n')
        )
        counts['Feature'] = feature
        per_feature.append(counts)

    if not per_feature:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(per_feature)

In [94]:
# Observation counts per bin on the binning sample.
df_train_bin_sts = count_obs_per_bin(data=df_train_bin, columns=COLS_TO_USE)

In [95]:
# Spot-check the bin counts of a single feature.
df_train_bin_sts[df_train_bin_sts['Feature'] == 'TRD_A_20']

Unnamed: 0,Bin,n,Feature
0,-2,7457,TRD_A_20
1,"{-3, 0}",5977,TRD_A_20
2,"[1, inf)",465,TRD_A_20
3,-1,41,TRD_A_20


In [96]:
def count_target_per_bin(data: pd.DataFrame, target_name: str, columns: List[str]) -> pd.DataFrame:
    """Count observations per bin and target label for each requested feature.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding the target column and one ``Bin<feature>`` column per
        requested feature.
    target_name : str
        Name of the target column; each distinct label becomes its own column
        in the result.
    columns : List[str]
        Encoded column names of the form ``<feature>OptBin``. The trailing
        ``OptBin`` is cut off to obtain the feature name; the bin labels are
        read from ``Bin<feature>``.

    Returns
    -------
    pd.DataFrame
        Table with a ``Bin`` column, one count column per target label and a
        ``Feature`` column. Bin/label combinations that do not occur are 0.
        An empty frame is returned when ``columns`` is empty.

    Raises
    ------
    KeyError
        If the target column or a ``Bin<feature>`` column is missing.
    """
    suffix = 'OptBin'
    per_feature = []
    for col in columns:
        feature = col[:-len(suffix)]
        bin_col = f'Bin{feature}'
        # Long (label, bin, n) counts, then one column per target label.
        counts = (
            data.groupby(target_name)[bin_col]
            .value_counts()
            .reset_index(name='n')
            .pivot(index=bin_col, columns=target_name, values='n')
            .reset_index()
            .rename(columns={bin_col: 'Bin'})
        )
        counts['Feature'] = feature
        per_feature.append(counts)

    if not per_feature:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop; missing
    # bin/label combinations come out of the pivot as NaN and are set to 0.
    return pd.concat(per_feature).fillna(0)

In [97]:
# Per-bin statistics of the binning sample: counts per target label joined
# with the total number of observations per (Feature, Bin).
# The observation counts are recomputed here rather than read from the
# previous value of `df_train_bin_sts`, so re-running this cell does not merge
# an already-merged table with itself.
df_train_bin_sts_t = count_target_per_bin(data=df_train_bin, target_name=TARGET_NAME, columns=COLS_TO_USE)
df_train_bin_sts = pd.merge(
    df_train_bin_sts_t,
    count_obs_per_bin(data=df_train_bin, columns=COLS_TO_USE),
    on=['Feature', 'Bin'],
)

In [99]:
# Display the merged per-bin statistics.
df_train_bin_sts

Unnamed: 0,Bin,B,G,Feature,n
0,"(-inf, 684.50)",182.0,1145.0,OPTIN_11SCN_ML,1327
1,-998.0,0.0,48.0,OPTIN_11SCN_ML,48
2,"[1025.50, 1103.50)",16.0,2084.0,OPTIN_11SCN_ML,2100
3,"[1103.50, 1132.50)",2.0,822.0,OPTIN_11SCN_ML,824
4,"[1132.50, inf)",6.0,3861.0,OPTIN_11SCN_ML,3867
...,...,...,...,...,...
172,"[10.00, inf)",28.0,2294.0,TRD_O_06,2322
173,{'A'},83.0,1000.0,GEN11_SP_N_92_ABOther,1083
174,{'B'},110.0,8342.0,GEN11_SP_N_92_ABOther,8452
175,{'Other'},238.0,4126.0,GEN11_SP_N_92_ABOther,4364


In [103]:
# Sanity checks on the merged table: no two rows share the same
# (Bin, Feature, n), nor the same (Bin, Feature, B, G).
assert not df_train_bin_sts[['Bin', 'Feature', 'n']].duplicated().any()
assert not df_train_bin_sts[['Bin', 'Feature', 'B', 'G']].duplicated().any()
# Drop the intermediate per-target table; re-running this cell without
# rebuilding it first raises NameError.
del df_train_bin_sts_t

In [105]:
# Event rate per bin: share of 'B' rows among all rows of the bin.
df_train_bin_sts['Event rate'] = df_train_bin_sts['B'] / df_train_bin_sts['n']

In [106]:
# Spot-check the statistics of a single feature.
df_train_bin_sts[df_train_bin_sts.Feature == 'TRD_A_20']

Unnamed: 0,Bin,B,G,Feature,n,Event rate
20,-1,0.0,41.0,TRD_A_20,41,0.0
21,-2,378.0,7079.0,TRD_A_20,7457,0.050691
22,"[1, inf)",8.0,457.0,TRD_A_20,465,0.017204
23,"{-3, 0}",45.0,5932.0,TRD_A_20,5977,0.007529


In [122]:
# Persist the binning-sample statistics to the data folder (no index column).
df_train_bin_sts.to_csv(os.path.join(DATA_PATH, 'woe_sample_for_bins.csv'), index=False)

In [124]:
# Total number of 'B' rows in the binning sample.
(df_train_bin[TARGET_NAME] == 'B').sum()

431

In [125]:
# Total number of 'G' rows in the binning sample.
(df_train_bin[TARGET_NAME] == 'G').sum()

13509

## Train + val

In [107]:
# Per-bin statistics on train+val: counts per target label joined with the
# total number of observations per (Feature, Bin).
df_train_val_bin_sts_t = count_target_per_bin(data=df_train_val, target_name=TARGET_NAME, columns=COLS_TO_USE)
df_train_val_bin_sts = pd.merge(
    df_train_val_bin_sts_t,
    count_obs_per_bin(data=df_train_val, columns=COLS_TO_USE),
    on=['Feature', 'Bin'],
)

# No two rows may share the same (Bin, Feature, n) or (Bin, Feature, B, G).
for key_columns in (['Bin', 'Feature', 'n'], ['Bin', 'Feature', 'B', 'G']):
    assert not df_train_val_bin_sts[key_columns].duplicated().any()
del df_train_val_bin_sts_t

# Event rate per bin: share of 'B' rows among all rows of the bin.
df_train_val_bin_sts['Event rate'] = df_train_val_bin_sts['B'] / df_train_val_bin_sts['n']

In [109]:
# Spot-check the train+val statistics of a single feature.
df_train_val_bin_sts[df_train_val_bin_sts.Feature == 'TRD_A_20']

Unnamed: 0,Bin,B,G,Feature,n,Event rate
21,-1,4.0,205.0,TRD_A_20,209,0.019139
22,-2,2148.0,32898.0,TRD_A_20,35046,0.061291
23,"[1, inf)",79.0,1761.0,TRD_A_20,1840,0.042935
24,"{-3, 0}",278.0,25166.0,TRD_A_20,25444,0.010926


In [111]:
# Cross-check against the raw data: 'B' rows in the train+val window whose
# raw TRD_A_20 equals -1.
train_val_window_b = (df._RDATE >= TRAIN_FROM) & (df._RDATE < VAL_TO)
df.loc[train_val_window_b & (df[TARGET_NAME] == 'B') & (df.TRD_A_20 == -1)].shape

(4, 1867)

In [114]:
# Cross-check against the raw data: all non-'I' rows in the train+val window
# whose raw TRD_A_20 equals -1.
train_val_window_n = (df._RDATE >= TRAIN_FROM) & (df._RDATE < VAL_TO)
df.loc[train_val_window_n & (df[TARGET_NAME] != 'I') & (df.TRD_A_20 == -1)].shape

(209, 1867)

In [115]:
# Persist the train+val statistics to the data folder (no index column).
df_train_val_bin_sts.to_csv(os.path.join(DATA_PATH, 'woe_train_val.csv'), index=False)

In [126]:
# Totals per target label on train+val.
df_train_val[TARGET_NAME].value_counts()

GB6_Flag_2Limit
G    60030
B     2509
Name: count, dtype: int64

## Test

In [116]:
# Per-bin statistics on the test window ('I' label excluded).
# Both tables are built from `df_test_same`, so the per-target counts and the
# observation counts always describe the same rows. `df_test` is a different
# frame (it keeps the 'I' rows) unless the later re-definition cell has run.
df_test_bin_sts = count_obs_per_bin(data=df_test_same, columns=COLS_TO_USE)
df_test_bin_sts_t = count_target_per_bin(data=df_test_same, target_name=TARGET_NAME, columns=COLS_TO_USE)
df_test_bin_sts = pd.merge(df_test_bin_sts_t, df_test_bin_sts, on=['Feature', 'Bin'])

# No two rows may share the same (Bin, Feature, n) or (Bin, Feature, B, G).
assert not df_test_bin_sts[['Bin', 'Feature', 'n']].duplicated().any()
assert not df_test_bin_sts[['Bin', 'Feature', 'B', 'G']].duplicated().any()
del df_test_bin_sts_t


# Event rate per bin: share of 'B' rows among all rows of the bin.
df_test_bin_sts['Event rate'] = df_test_bin_sts['B'] / df_test_bin_sts['n']

In [117]:
# Spot-check the test statistics of a single feature.
df_test_bin_sts[df_test_bin_sts.Feature == 'TRD_A_20']

Unnamed: 0,Bin,B,G,Feature,n,Event rate
21,-1,4.0,45.0,TRD_A_20,49,0.081633
22,-2,1179.0,8267.0,TRD_A_20,9446,0.124815
23,"[1, inf)",36.0,352.0,TRD_A_20,388,0.092784
24,"{-3, 0}",115.0,4999.0,TRD_A_20,5114,0.022487


In [118]:
# Cross-check against the raw data: 'B' rows in the test window whose raw
# TRD_A_20 equals -1.
test_window_b = (df._RDATE >= TEST_FROM) & (df._RDATE < TEST_TO)
df.loc[test_window_b & (df[TARGET_NAME] == 'B') & (df.TRD_A_20 == -1)].shape

(4, 1867)

In [120]:
# Cross-check against the raw data: all non-'I' rows in the test window whose
# raw TRD_A_20 equals -1.
test_window_n = (df._RDATE >= TEST_FROM) & (df._RDATE < TEST_TO)
df.loc[test_window_n & (df[TARGET_NAME] != 'I') & (df.TRD_A_20 == -1)].shape

(49, 1867)

In [121]:
# Persist the test statistics to the data folder (no index column).
df_test_bin_sts.to_csv(os.path.join(DATA_PATH, 'woe_test.csv'), index=False)

In [127]:
# Totals per target label on the 'I'-excluded test split.
df_test_same[TARGET_NAME].value_counts()

GB6_Flag_2Limit
G    13663
B     1334
Name: count, dtype: int64

In [128]:
# Test statistics of one OPTIN_11SCN_ML bin; used for the manual
# weight-of-evidence calculation in the following cells.
df_test_bin_sts[(df_test_bin_sts.Feature == 'OPTIN_11SCN_ML') & (df_test_bin_sts.Bin == "[859.50, 955.50)")]

Unnamed: 0,Bin,B,G,Feature,n,Event rate
9,"[859.50, 955.50)",175.0,2078.0,OPTIN_11SCN_ML,2253,0.077674


In [133]:
# Test: share (%) of all 'B' rows that fall in the bin (175 of 1334).
175.0 * 100 /1334

13.118440779610195

In [139]:
# Test: share (%) of all 'G' rows that fall in the bin (2078 of 13663).
2078.0 * 100 / 13663

15.208958501061261

In [140]:
# Test: weight of evidence of the bin, ln(%G / %B), from the two shares above.
np.log(15.208958501061261/13.118440779610195)

0.14786569587273984

In [129]:
# Train+val statistics of the same OPTIN_11SCN_ML bin.
# Both conditions are evaluated on `df_train_val_bin_sts`; a mask built from a
# different frame would only select the right rows if the indexes happened to
# line up.
df_train_val_bin_sts[(df_train_val_bin_sts.Feature == 'OPTIN_11SCN_ML') & (df_train_val_bin_sts.Bin == "[859.50, 955.50)")]

Unnamed: 0,Bin,B,G,Feature,n,Event rate
9,"[859.50, 955.50)",337.0,8418.0,OPTIN_11SCN_ML,8755,0.038492


In [134]:
# Train+val: share (%) of all 'B' rows that fall in the bin (337 of 2509).
337.0* 100 /2509

13.431646074133122

In [137]:
# Train+val: share (%) of all 'G' rows that fall in the bin (8418 of 60030).
8418.0 * 100 /60030

14.022988505747126

In [142]:
# Train+val: weight of evidence of the bin, ln(%G / %B), from the two shares above.
np.log(14.022988505747126/13.431646074133122)

0.04308444909868764

In [131]:
# Binning-sample statistics of the same OPTIN_11SCN_ML bin.
df_train_bin_sts[(df_train_bin_sts.Feature == 'OPTIN_11SCN_ML') & (df_train_bin_sts.Bin == "[859.50, 955.50)")]

Unnamed: 0,Bin,B,G,Feature,n,Event rate
8,"[859.50, 955.50)",34.0,1627.0,OPTIN_11SCN_ML,1661,0.02047


In [136]:
# Binning sample: share (%) of all 'B' rows that fall in the bin (34 of 431).
34*100/431

7.888631090487239

In [143]:
# Binning sample: share (%) of all 'G' rows that fall in the bin (1627 of 13509).
1627.0*100/13509

12.043822636760678

In [144]:
# Binning sample: weight of evidence of the bin, ln(%G / %B), from the two shares above.
np.log(12.043822636760678/7.888631090487239)

0.4231292637369428