# Load data

In [5]:
import pandas as pd
import numpy as np
from time import time
# https://docs.google.com/spreadsheets/d/1LdpV5SxqbSiDta9eB44iisqxdSUNqH1GiwJx9tY2o1I/edit?usp=sharing
import warnings
warnings.filterwarnings("ignore") ##忽略警告

In [6]:
code = "1LdpV5SxqbSiDta9eB44iisqxdSUNqH1GiwJx9tY2o1I"
read_df = pd.read_csv("https://docs.google.com/spreadsheets/d/" + code + "/gviz/tq?tqx=out:csv")


In [7]:
sample_df = read_df.iloc[:,2:24].copy()
lst_cat = ["vehicle_year", "vehicle_make","bankruptcy_ind", "used_ind"] 

In [8]:
# treat tot_tr as having some -99xxxx values
sample_df.tot_tr[0:30] = -999901
sample_df.tot_tr[31:90] = -999902
sample_df.tot_tr[91:100] = -999999
sample_df.tot_tr[100:180] = np.nan
sample_df.tot_tr[200:280] = None

# tot_rev_debt as having 1 -99xxxx value
sample_df.tot_rev_debt[00:30] = -999999
sample_df.tot_rev_debt[31:80] = np.nan

# insert some nan to vehicle make
sample_df.vehicle_make[31:80] = 'nan'
sample_df.vehicle_make[81:120] = None

# treat used_ind as highly skewed data
sample_df.used_ind[0:5000] = 2
sample_df.used_ind[5000:5500] = 1

# treat tot_derog , and age_oldest_tr and the rest as normal continuous feature

sample_df

Unnamed: 0,bad_ind,vehicle_year,vehicle_make,bankruptcy_ind,tot_derog,tot_tr,age_oldest_tr,tot_open_tr,tot_rev_tr,tot_rev_debt,tot_rev_line,rev_util,fico_score,purch_price,msrp,down_pyt,loan_term,loan_amt,ltv,tot_income,veh_mileage,used_ind
0,1,1998.0,FORD,N,7.0,-999901.0,64.0,2.0,1.0,-999999.0,500.0,101,650.0,17200.00,17350.0,0.00,36,17200.00,99.0,6550.00,24000.0,2
1,0,2000.0,DAEWOO,N,0.0,-999901.0,240.0,11.0,7.0,-999999.0,57241.0,60,649.0,19588.54,19788.0,683.54,60,19588.54,99.0,4666.67,22.0,2
2,1,1998.0,PLYMOUTH,N,7.0,-999901.0,60.0,,,-999999.0,,0,613.0,13595.00,11450.0,0.00,60,10500.00,92.0,2000.00,19600.0,2
3,1,1997.0,FORD,N,3.0,-999901.0,35.0,5.0,4.0,-999999.0,5946.0,68,603.0,12999.00,12100.0,3099.00,60,10800.00,118.0,1500.00,10000.0,2
4,0,2000.0,TOYOTA,N,0.0,-999901.0,104.0,2.0,0.0,-999999.0,1800.0,0,764.0,26328.04,22024.0,0.00,60,26328.04,122.0,4144.00,14.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5840,0,1997.0,PORSCHE,N,0.0,21.0,417.0,4.0,2.0,1859.0,52200.0,4,801.0,0.00,31000.0,0.00,36,31000.00,100.0,5000.00,45000.0,1
5841,0,2000.0,TOYOTA,Y,2.0,8.0,62.0,5.0,3.0,4992.0,5066.0,99,628.0,24970.00,22024.0,0.00,60,24970.00,117.0,2400.00,21.0,0
5842,0,1997.0,CHEVROLET,N,0.0,6.0,30.0,4.0,3.0,972.0,5616.0,17,735.0,20949.00,18950.0,0.00,36,20949.00,113.0,1837.50,25000.0,1
5843,0,1999.0,MERCURY,N,0.0,9.0,67.0,7.0,5.0,13714.0,14061.0,98,737.0,22400.00,28700.0,5300.00,48,17100.00,60.0,28000.00,0.0,0


In [9]:
## define some NA values by user, both works
NA_dict = {"vehicle_year":["1998.0"], "vehicle_make":["FORD","B50"]}
NA_list = [-999901,-999902,-999999,-990001]

# Class

In [1]:
## New OO Helper with cleaned code

import os
import pandas as pd
import numpy as np
from time import time as now
from scipy.stats import chi2, chisquare
import math
from sklearn.base import TransformerMixin


class VarBinHelper(TransformerMixin):

    def __init__(self, **kwargs):
        ## initialise the object with name of label column, min_sample, min_bin_num
        self.min_sample = kwargs.get('min_sample', 0.02)
        self.min_bin = kwargs.get('min_bin', 2)
        self.max_bin = kwargs.get('max_bin', 10)
        self.chimerge_threshold = kwargs.get("chimerge_threshold", chi2.ppf(0.95, 1))
        self.label = kwargs.get('label', None)
        self._fit = False
        self.missing_values_found = {} ## will be a dict
        
        ## to make the class interface same as other versions from the team
        self.categorical_features = None ## updated in fit() 
        self.numerical_features = None ## updated in fit() 
        self.woe_encoder = None  ## model
        self.dict_binlist = None ## model.bin_info
        

    def set_chimerge_threshold(self, p=0.95, df=1):
        self.chimerge_threshold = chi2.ppf(p, df)

    def init_cat_bin(self, sr_feature, y, min_sample=0.01, **kwargs):
        ## put each outcome as 1 bin, rank by bad_rate, merge small bins with the neighbor with closest bad_rate
        ## assume all categorical values are string, including year eg. "2020"
        method = kwargs.get('method', "chi_merge")
        min_bin_size = kwargs.get("min_bin_size", 5)
        multi_missing = kwargs.get("multi_missing", False)
        dict_na = kwargs.get('missing_values', {}) 
        merge_category = kwargs.get("merge_category", True)
        init_merge_small_bin = kwargs.get('init_merge_small_bin', True)

        feature_name = sr_feature.name
        if type(dict_na) == list:
            lst_na = dict_na
        else:
            lst_na = dict_na.get(feature_name, ['nan', None]) 
        
        # decide bin_size (min sample in a bin)
        df = pd.concat([sr_feature, y], axis=1)
        if min_sample > 1:  ## find the size of bin
            bin_size = int(max(min_sample, min_bin_size))
        else:
            bin_size = int(max(min_sample * len(sr_feature), min_bin_size))

        # initialise each value as 1 bin
        lst_unique = sr_feature.unique().tolist()
        df_bin_interval = pd.DataFrame(columns=['bin', 'total', 'total_rate', 'bad', 'bad_rate'], index=list(range(len(lst_unique))))
        df_bin_interval.bin = lst_unique

        # calculate total, total_rate, bad, bad_rate for each bin
        for idx, row in df_bin_interval.iterrows():
            row.bin = [row.bin]
            row.total = df[sr_feature.name].isin(df_bin_interval.loc[idx, 'bin']).sum()
            row.total_rate = row.total / len(sr_feature)
            row.bad = len(df.loc[(df[sr_feature.name].isin(row.bin)) & (df[y.name] == 1)])
            row.bad_rate = row.bad / row.total

        # separates NA values as unique bins
        if multi_missing is not None:
            
            ## determine what NA values exist in this series
            if np.nan in lst_na:
                lst_na.remove(np.nan)
            if 'nan' not in lst_na:
                lst_na.append('nan') ## because sr_feature is passed in as df['feature_name].astype(str), we can only find "nan"
            if None not in lst_na:
                lst_na.append(None)

            lst_na_exist = list(set(lst_na) & set(lst_unique)) ## use set interscetion because lst_na might have values not in lst_unique
            self.missing_values_found[feature_name] = lst_na_exist

            if list(set(lst_na) - set(lst_na_exist)):
                print("NA values ", list(set(lst_na) - set(lst_na_exist)), " not found in ", sr_feature.name)
            
            # put NA bins' index in list, use .loc() to extract, then drop them from df_bin_interval
            lst_na_idx = list()
            for na_value in lst_na_exist:  
                lst_na_idx.append(df_bin_interval.loc[df_bin_interval.bin.apply(lambda x: x == [na_value])].index[0])

            df_na_bin = df_bin_interval.loc[lst_na_idx]
                        
            if multi_missing == False and len(lst_na_exist)>0:
                df_temp = pd.DataFrame(columns=['bin', 'total', 'total_rate', 'bad', 'bad_rate'])
                df_temp.bin = [lst_na_exist]
                df_temp.total[0] = df_na_bin.total.sum()
                df_temp.bad[0] = df_na_bin.bad.sum()
                df_temp.total_rate = df_temp.total / len(sr_feature)
                df_temp.bad_rate = df_temp.bad / df_temp.total
                df_na_bin = df_temp

            df_bin_interval = df_bin_interval.drop(index=lst_na_idx)

        df_bin_interval = df_bin_interval.sort_values(by=['bad_rate']).reset_index(drop=True)

        # merge small bins < bin_size for certian methods
        if init_merge_small_bin == True or merge_category == True: # and merge_category == True
            df_bin_interval = self.merge_small_cat_bins(df_bin_interval, bin_size)

        return df_na_bin, df_bin_interval

    def merge_cat_bin(self, df_bin_interval, idx_left, idx_right):
        bin_left = df_bin_interval.loc[idx_left]
        bin_right = df_bin_interval.loc[idx_right]
        bin_left.bad += bin_right.bad
        bin_left.total += bin_right.total
        bin_left.bad_rate = bin_left.bad / bin_left.total
        bin_left.bin += bin_right.bin
        df_bin_interval = df_bin_interval.drop(idx_right).reset_index(drop=True)
        return df_bin_interval
    
    def merge_small_cat_bins(self, df_bin_interval, bin_size):

        ## choose the best neighbor(left vs right) to merge, based on bad_rate similarity
        while df_bin_interval.total.min() < bin_size:
            idx = df_bin_interval.total.astype(int).idxmin()
            if idx == 0:
                ## left most bin, no choice, merge with right neighbor
                df_bin_interval = self.merge_cat_bin(df_bin_interval, idx, idx + 1)
            elif idx == len(df_bin_interval) - 1:
                ## right most bin, merge with left neighbor
                df_bin_interval = self.merge_cat_bin(df_bin_interval, idx - 1, idx)
            else:
                bad_rate = df_bin_interval.bad_rate[idx]
                bad_rate_right = df_bin_interval.bad_rate[idx + 1]
                bad_rate_left = df_bin_interval.bad_rate[idx - 1]
                diff_left = bad_rate - bad_rate_left
                diff_right = bad_rate_right - bad_rate
                merge_right = diff_right < diff_left  ## True False but used as 1 and 0 in the next line, to decide where to merge
                df_bin_interval = self.merge_cat_bin(df_bin_interval, idx - 1 + merge_right, idx + merge_right)
        
        return df_bin_interval

    def calc_chi2_cat(self, df_bin_interval):
        ## only being called once for each feature, find chi2 the first time.
        ## No kwargs to pass in
        total_count = df_bin_interval.total.sum()
        total_bad = df_bin_interval.bad.sum()
        total_good = total_count - total_bad

        ## initialise the df to return
        cols = ["bin", "sample_count", "bad_count", "good_count", "bad_rate", "bad_count_exp",
                "good_count_exp", "chi2", "chi2_after_merge_with_left"]
        df = pd.DataFrame(columns=cols)
        df.bin = df_bin_interval.bin
        df.sample_count = df_bin_interval.total
        df.bad_count = df_bin_interval.bad
        df.bad_rate = df_bin_interval.bad_rate

        ## find chi2 related stats for each bin(row)
        for index, row in df.iterrows():
            row.good_count = row.sample_count - row.bad_count
            row.bad_count_exp = (row.sample_count) / total_count * total_bad
            row.good_count_exp = (row.sample_count) / total_count * total_good
            row.chi2 = chisquare([row.bad_count, row.good_count], f_exp=[row.bad_count_exp, row.good_count_exp])[0]
            if index > 0:
                row.chi2_after_merge_with_left = row.chi2 + df.chi2[index - 1]

        return df

    def init_cont(self, sr_feature, y, **kwargs):

        ## missing value handling --> default is 1 single bin!
        ## min_sample < 1 means each bin has same proprtion (eg. 0.05) of all samples.
        ## min_bin_size -->  optional, dfaut = 5
        ## prioritise min_sample --> is must have
        ## if dont fulfill, error
        ## >1 means each bin has fixed number of samples
        min_sample = kwargs.get("min_sample", self.min_sample)
        min_bin_size = kwargs.get("min_bin_size", 5) # min sample count in a bin
        multi_missing = kwargs.get("multi_missing", False)
        init_method = kwargs.get("init_method", "quantile")
        dict_na = kwargs.get('missing_values', {})
        init_merge_small_bin = kwargs.get('init_merge_small_bin', True)
        feature_name = sr_feature.name
        # sr_feature[sr_feature.isna()] = np.nan ## set all the NAs to np.nan

        if type(dict_na) == list:
            lst_na = dict_na
        else:
            lst_na = dict_na.get(feature_name, []) 
        # print("----- 186",lst_na)
        ## find the size of bin
        if min_sample > 1:  
            bin_size = int(max(min_sample, min_bin_size))
        else:
            bin_size = int(max(min_sample * len(sr_feature), min_bin_size))

        ## sort the varibale for later binning, not using unique values because we are doing same frequency
        sr_feature_sorted = sr_feature.sort_values().reset_index(drop=True).copy()

        ## if choose separate bin for missing value, add np.nan as a bin, and each value <= -99000 as a bin
        if multi_missing is not None:
                    
            if np.nan not in lst_na:
                lst_na.append(np.nan)
            # print("----- 203",lst_na) # if float("nan") not in lst_na:
            #     lst_na.append(float("nan"))
            array_feature_unique = sr_feature_sorted.unique()
            array_possible_na = array_feature_unique[array_feature_unique <= -990000] # eg 990001 990003
            # print("----- 207",array_possible_na)
            
            if dict_na:
                for na_val in array_possible_na.tolist():
                    if na_val not in lst_na:
                        print(na_val," found in feature:",feature_name,", but not specified in missing_values.")

            lst_na = list(set(lst_na).union(set(array_possible_na.tolist())))

            lst_na_lst = list()
            for na_value in lst_na:
                if na_value == "nan" or na_value is None:
                    na_value = np.nan
                    if np.nan in lst_na:
                        continue
                else:
                    lst_na_lst.append([na_value])

            sr_feature_sorted = sr_feature_sorted.dropna()
            sr_feature_sorted = sr_feature_sorted[sr_feature_sorted > -990000].reset_index(drop = True)

        ## find the target count of bins for normal bins
        target_bin_count = len(sr_feature_sorted) / bin_size

        idx = bin_size - 1  ## initialise the running index to look at first cut point
        lst_bin_interval = list()
        lst_bin_up = list()
        lst_bin_low = [-990000]  ## first lower bound is -inf

        ## if unique value is smaller than target_bin_count, each is 1 bin
        if (sr_feature_sorted.nunique() < target_bin_count):
            for cur_val in sr_feature_sorted.unique().tolist():
                lst_bin_interval.append(pd.Interval(left = lst_bin_low[-1:][0], right = cur_val, closed = 'right'))
                lst_bin_up.append(cur_val)
                lst_bin_low.append(cur_val)

        # initialise with equal frequency
        elif init_method == "quantile":
            ##  Start Binning. Jump every <bin_size> in the sorted X array to record cut points
            while idx < len(sr_feature_sorted):
                cur_val = sr_feature_sorted.loc[idx]
                ##  every bin_low is exclusive, bin_up is inclusive, interval like (low,up]
                ## prevent having intervals like (x,x], which is empty bin
                if cur_val in lst_bin_up:
                    ## change idx to point to next new value
                    try: ## only error is when last unique value count is larger then bin_size
                        idx = sr_feature_sorted[sr_feature_sorted > cur_val].index[0]
                        continue
                    except:
                        pass

                if cur_val not in lst_bin_up:
                    if  not math.isnan(cur_val):
                        lst_bin_interval.append(pd.Interval(left = lst_bin_low[-1], right = cur_val, closed = 'right'))
                        lst_bin_up.append(cur_val)
                        lst_bin_low.append(cur_val)
                    
                ## inspect the next value in sr_feature_sorted after <bin_size>
                idx += bin_size

        # initialise with equal distance
        elif init_method == "step":
            len_sr = len(sr_feature_sorted)
            sr_feature_sorted = sr_feature_sorted[ int(0.05*len_sr) : int(0.95*len_sr) ].reset_index(drop=True)## follow book, ignore < 5% and > 95%
            value_min = sr_feature_sorted[0]
            value_max = sr_feature_sorted[len(sr_feature_sorted)-1]
            dist = (value_max-value_min) / (len(sr_feature_sorted) / (bin_size*0.9)) ## (len(sr_feature_sorted) / bin_size) is number of bins to start with
            cur_val = value_min

            # if dist > 0.01:  # round the cut points for simplicity
            #     print("cut ponints will round to 6 dp.")
            #     # dist = round(dist, 6)
            
            # go through each cut point, add to lists
            while (cur_val < value_max*1.001):
                if dist > 0.01:
                    cur_val = round(cur_val, 6)
                lst_bin_interval.append(pd.Interval(left = lst_bin_low[-1], right = cur_val, closed = 'right'))
                lst_bin_up.append(cur_val)
                lst_bin_low.append(cur_val)
                cur_val += dist

            lst_bin_interval.append(pd.Interval(left = lst_bin_low[-1], right = cur_val, closed = 'right'))
            lst_bin_up.append(value_max)
            lst_bin_low.append(value_max)

        ## assume the highest bin is small, merge with 2nd highest bin, set upper bound as inf
        lst_bin_low = lst_bin_low[:-1]
        lst_bin_up[-1] = np.inf
        lst_bin_interval[-1] = pd.Interval(left = lst_bin_low[-1], right = np.inf, closed = 'right')

        ## create the df of normal bins to return
        df_bin_interval = pd.DataFrame(columns= ['bin', 'bin_low', 'bin_up', 'total', 'total_rate', 'bad', 'bad_rate'])
        df_bin_interval.bin = lst_bin_interval
        df_bin_interval.bin_low = lst_bin_low
        df_bin_interval.bin_up = lst_bin_up
        df_bin_interval.index.name = 'bin_num'
        
        # calculate 'total', 'total_rate', 'bad', 'bad_rate'
        for idx, row in df_bin_interval.iterrows():
            df_bin_interval.loc[idx,'total'] = len(sr_feature[(sr_feature > row.bin_low) & (sr_feature <= row.bin_up)])
            df_bin_interval.loc[idx,'total_rate'] = df_bin_interval.loc[idx,'total'] / len(sr_feature)
            df_bin_interval.loc[idx, 'bad'] = len(y[((sr_feature > row.bin.left) & (sr_feature <= row.bin.right)) & y==1])
            if df_bin_interval.loc[idx,'total'] != 0:
                df_bin_interval.loc[idx, 'bad_rate'] = df_bin_interval.loc[idx, 'bad'] / df_bin_interval.loc[idx,'total']
        
        ## merge small bins, since equal distance will have empty/small bins 
        if init_method == "step" or init_merge_small_bin:
            while (df_bin_interval.total.min()<bin_size):
                idx = df_bin_interval.total.astype(int).idxmin()
                if idx == 0:
                    ## left most bin, no choice, merge with right neighbor
                    df_bin_interval = self.merge_cont_bin(df_bin_interval, idx, idx + 1)
                elif idx == len(df_bin_interval) - 1:
                    ## right most bin, merge with left neighbor
                    df_bin_interval = self.merge_cont_bin(df_bin_interval, idx - 1, idx)
                else:
                    bad_rate = df_bin_interval.bad_rate[idx]
                    bad_rate_right = df_bin_interval.bad_rate[idx + 1]
                    bad_rate_left = df_bin_interval.bad_rate[idx - 1]
                    diff_left = bad_rate - bad_rate_left
                    diff_right = bad_rate_right - bad_rate
                    merge_right = diff_right < diff_left  ## True False but used as 1 and 0 in the next line, to decide where to merge
                    df_bin_interval = self.merge_cont_bin(df_bin_interval, idx - 1 + merge_right, idx + merge_right)
        
        ## create the df of NA bins
        df_na_bin = pd.DataFrame(columns = ['bin', 'total', 'total_rate', 'bad', 'bad_rate'])       
        df_na_bin.bin = lst_na_lst
        lst_na_exist = []

        for idx, row in df_na_bin.iterrows():
            row.total = sr_feature.isin(row.bin).sum()
            row.total_rate = row.total / len(sr_feature)
            row.bad = len(y[sr_feature.isin(row.bin) & y==1])
            if row.total != 0:
                row.bad_rate = row.bad / row.total
                lst_na_exist += row.bin
            elif dict_na:
                print(row.bin[0], ", this missing value does not exist in ",feature_name)

        self.missing_values_found[feature_name] = lst_na_exist # update object attribute, for later checking in transform()

        if multi_missing == False:
            df_temp = pd.DataFrame(columns=['bin', 'total', 'total_rate', 'bad', 'bad_rate'])
            if len(lst_na_exist) == 0:
                lst_na_exist = [np.nan]
            df_temp.bin = [lst_na_exist]
            df_temp.total[0] = df_na_bin.total.sum()
            df_temp.bad[0] = df_na_bin.bad.sum()
            df_temp.total_rate = df_temp.total / len(sr_feature)
            df_temp.bad_rate = df_temp.bad / df_temp.total
            df_na_bin = df_temp

        return df_na_bin, df_bin_interval

    def map_bin(self, sr_feature, df_bin_interval, **kwargs):
        ## maps both categorical and numerical x
        ## sr_feature data should be 1 column of series-like
        inplace = kwargs.get("inplace", False)  ## by default will not overwrite sr_feature values, but add a column "bin"
        cat = kwargs.get('cat', False)
        bin_only = kwargs.get('bin_only', None)

        ## df is to record intermediate, will be returned
        var_name = sr_feature.name
        df = pd.DataFrame(sr_feature, columns=[var_name])  
        df[(var_name+'_bin')] = df[var_name]
        lst_bins = []
        cat_count = 0

        ## Mapping starts, iterates by intevals, for categorical, and NA bins of numerical, row.bin is a list, other numerical row.bin is a pd.Interval
        if cat:
            for idx, row in df_bin_interval.iterrows():
                # df.loc[(df[var_name].isin(row.bin)), (var_name+'_bin')] = idx
                if bin_only is None:
                    value = idx
                elif bin_only == True:
                    value = row.bin
                elif bin_only == False:
                    value = row.woe

                df[(var_name+'_bin')] = df[(var_name+'_bin')].replace(row.bin, value)
        else:
            for idx, row in df_bin_interval.iterrows():
                
                if bin_only is None:
                    value = idx
                elif bin_only == True:
                    value = row.bin
                elif bin_only == False:
                    value = row.woe

                if type(row.bin) == pd.Interval:
                    # df.loc[(df[var_name] > row.bin.left) & (df[var_name] <= row.bin.right), (var_name+'_bin')] = idx
                    df[(var_name+'_bin')] = df[(var_name+'_bin')].mask( ((df[var_name] > float(row.bin.left)) & (df[var_name] <= float(row.bin.right) ) ) , value) 
                    lst_bins.append(row.bin.left)

                else:
                    # df.loc[(df[var_name].isin(row.bin)), (var_name+'_bin')] = idx
                    df[(var_name+'_bin')] = df[(var_name+'_bin')].replace(row.bin, value)
                    cat_count += 1
            
            # lst_bins.append(np.inf)
            # df[(var_name+'_bin')] = pd.cut(df[(var_name+'_bin')], bins = lst_bins, labels=False, right=True)+cat_count

        if inplace: 
            df = df.drop(columns=[var_name])
            df.columns = [var_name]

        return df

    def calc_chi2(self, df_mapped, y, df_bin_interval, **kwargs):
        ## deal with both continuous feature, expect X have 2 columns, just the X var + mapping output
        ## df_bin_interval is the output from initialisation (same frequency or same distance)
        label = kwargs.get("label", self.label)
        var_name = df_mapped.columns[0]
        df_mapped = pd.concat([df_mapped, y], axis=1)
        df_mapped.columns = [var_name, label]
        cols = ["bin","bin_low", "bin_up", "sample_count", "bad_count", "good_count", "bad_rate", "bad_count_exp",
                "good_count_exp", "chi2", "chi2_after_merge_with_left"]

        total_bad = df_mapped[label].sum()  ## find the total bad count and good count
        total_good = len(df_mapped) - total_bad

        ## working df, to be returned
        df = pd.DataFrame(columns=cols, index=df_bin_interval.index.astype(int))
        starting_idx = df_bin_interval.index.astype(int).min()
        df.loc[:, ["bin", 'bin_low', 'bin_up']] = df_bin_interval.loc[:, ["bin", 'bin_low', 'bin_up']]

        for idx, row in df.iterrows():
            row.sample_count = len(df_mapped.loc[(df_mapped[var_name] == idx)])
            row.bad_count = len(df_mapped.loc[(df_mapped[var_name] == idx) & (df_mapped[label] == 1)])
            row.good_count = len(df_mapped.loc[(df_mapped[var_name] == idx) & (df_mapped[label] == 0)])
            row.bad_count_exp = (row.sample_count) / len(df_mapped) * total_bad
            row.good_count_exp = (row.sample_count) / len(df_mapped) * total_good
            row.chi2 = chisquare([row.bad_count, row.good_count], f_exp=[row.bad_count_exp, row.good_count_exp])[0]
            if idx >  starting_idx:
                row.chi2_after_merge_with_left = row.chi2 + df.chi2[idx - 1]
            if row.sample_count != 0:
                row.bad_rate = row.bad_count / row.sample_count
            else:
                row.bad_rate = np.nan

        return df

    def merge_pair(self, df_chi2, idx_left, idx_right):  
        ## merge row with idx_left and idx_right, called by chi2_merge(), both cat and continuous

        df = df_chi2  ## will return this df
        count_toal = df.sample_count.sum()
        bad_total = df.bad_count.sum()
        good_total = df.good_count.sum()

        row = df.loc[idx_left]
        next_row = df.loc[idx_right]

        try:
            row.bin_up = next_row.bin_up # assign upper interval, continuous
            row.bin = pd.Interval(left = row.bin.left, right = row.bin_up, closed = 'right')
        except:
            row.bin += next_row.bin # merge list, cat

        row.sample_count += next_row.sample_count
        row.bad_count += next_row.bad_count
        row.good_count += next_row.good_count
        row.bad_count_exp = row.sample_count / count_toal * bad_total
        row.good_count_exp = row.sample_count / count_toal * good_total
        row.chi2 = chisquare(f_obs=[row.bad_count, row.good_count], f_exp=[row.bad_count_exp, row.good_count_exp])[0]

        if row.sample_count != 0:
            row.bad_rate = row.bad_count / row.sample_count
        else:
            row.bad_rate = np.nan

        if idx_left > df.index.min():
            row.chi2_after_merge_with_left = row.chi2 + df.loc[idx_left - 1, 'chi2']  ## the left neighbor of left bin
        if idx_left + 2 < len(df_chi2):
            ## because the second last row does not have index+2 row, update the chi2 if merge with right bin's right neighbor
            df.loc[idx_left + 2, 'chi2_after_merge_with_left'] = row.chi2 + df.loc[idx_left + 2, 'chi2']

        df.loc[idx_left] = row
        return df.drop([idx_right]).reset_index(drop=True)

    def chi2_merge(self, df_chi2, **kwargs):

        chimerge_threshold = kwargs.get("chimerge_threshold", self.chimerge_threshold)
        min_bin = kwargs.get("min_bin", self.min_bin)
        max_bin = kwargs.get("max_bin", self.max_bin)
        ## merge all bins pairs with chi2 < chimerge_threshold, starting with lowest chi1 value
        ## stop when min_bin is reached, or when no more chi2 < critical
        while len(df_chi2) > min_bin:
            sr_chi2 = df_chi2['chi2_after_merge_with_left'][1:]  ## index 0's value is NA, we use index 1 onwards
            idx_min_chi2 = sr_chi2.astype(float).idxmin()
            if df_chi2.loc[idx_min_chi2, 'chi2_after_merge_with_left'] > chimerge_threshold:
                break  ## stop this loop if no more chi2 < threshold
            idx_right = idx_min_chi2
            idx_left = idx_min_chi2 - 1
            df_chi2 = self.merge_pair(df_chi2, idx_left, idx_right)
        
        ## further merge bins if max_bin < current bin count
        if max_bin is not None:  
            while max_bin < len(df_chi2):
                sr_chi2 = df_chi2['chi2_after_merge_with_left'][1:]
                idx_min_chi2 = sr_chi2.astype(float).idxmin()
                idx_right = idx_min_chi2
                idx_left = idx_min_chi2 - 1
                df_chi2 = self.merge_pair(df_chi2, idx_left, idx_right)

        df_bin_interval = df_chi2.drop(
            columns=["good_count", "bad_count_exp", "good_count_exp", "chi2", "chi2_after_merge_with_left"]).copy() ## chi2 intermediate workings are dropped
        df_bin_interval.columns = df_bin_interval.columns.tolist()[:-3] + ["total", 'bad', 'bad_rate'] ## handles both cat and continuous
        df_bin_interval['total_rate'] = df_bin_interval.total / df_bin_interval.total.sum()
        cols = df_bin_interval.columns.tolist()[:-4] + ['total', 'total_rate', 'bad', 'bad_rate'] ## re-order the columns
        df_bin_interval = df_bin_interval[cols].reset_index(drop=True)

        return df_bin_interval, df_chi2

    def find_cut_point(self, df_bin_interval, bin_num_temp, **kwargs ):
        ## df_bin_temp is df_bin_interval after adding columns in self.top_down_cut()
        method = kwargs.get("method","iv")
        df_bin_temp = df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp]
        if len(df_bin_temp) == 1:
            return -1, -1
       
        best_cut_right = -1
        score_best = -1
    
        if method=='iv':
            ## try all cut points within the rows in df
            # if 5 bins 0,1,2,3,4 will try cut at 1,2,3,4 bin < cut_point is bin_left. 
            # eg cut_point is 2, left is 0,1, right is 2,3,4
            
            iv_best = -1
            eps = np.finfo(np.float32).eps
            
            for cut_point in range( df_bin_temp.index.min()+1 , df_bin_temp.index.max() ):     
                bin_left = df_bin_temp.loc[:cut_point-1, :]
                bin_right = df_bin_temp.loc[cut_point: ,:]
                # represent the parts in WOE in variables
                good_over_good_total_left = (bin_left.total.sum() - bin_left.bad.sum()) / (df_bin_temp.total.sum() - df_bin_temp.bad.sum())
                good_over_good_total_right = (bin_right.total.sum() - bin_right.bad.sum()) / (df_bin_temp.total.sum() - df_bin_temp.bad.sum())
                bad_over_bad_total_left =  bin_left.bad.sum() / df_bin_temp.bad.sum() 
                bad_over_bad_total_right = bin_right.bad.sum() / df_bin_temp.bad.sum()
                
                ## to give a very high value when good_over_good_total = 0
                woe_left =  np.log( (bad_over_bad_total_left + eps) / (good_over_good_total_right + eps) )  
                woe_right = np.log( ( bad_over_bad_total_right + eps) / (good_over_good_total_right + eps) )

                ## left side iv
                iv = ( bad_over_bad_total_left - good_over_good_total_left ) * woe_left

                ## right side iv
                iv = iv + (bad_over_bad_total_right - good_over_good_total_right) * woe_right
                if iv > iv_best:
                    iv_best = iv
                    best_cut_right = cut_point

            score_best = iv_best

        if method == "chi":
           
            chi2_best = -1
            eps = np.finfo(np.float32).eps
            overall_bad_rate = df_bin_temp.bad.sum() / df_bin_temp.total.sum() 
            overall_good_rate = 1 - overall_bad_rate

            for cut_point in range( df_bin_temp.index.min()+1 , df_bin_temp.index.max() ):     
                bin_left = df_bin_temp.loc[:cut_point-1, :]
                bin_right = df_bin_temp.loc[cut_point: ,:]
                # represent the parts in WOE in variables

                expected_bad_left = bin_left.total.sum() * overall_bad_rate
                expected_good_left = bin_left.total.sum() * overall_good_rate
                good_left = bin_left.total.sum() - bin_left.bad.sum()
                # chi2_left =  ( (bin_left.bad.sum() - expected_bad_left)**2 / expected_bad_left ) + ( (good_left - expected_good_left)**2 / expected_good_left )
                chi2_left = chisquare([bin_left.bad.sum(), good_left], f_exp=[expected_bad_left, expected_good_left])[0]
                
                ## + eps
                expected_bad_right = bin_right.total.sum() * overall_bad_rate
                expected_good_right = bin_right.total.sum() * overall_good_rate
                good_right = bin_right.total.sum() - bin_right.bad.sum()
                # chi2_right =  ( (bin_right.bad.sum() - expected_bad_right)**2 / expected_bad_right ) + ( (good_right - expected_good_right)**2 / expected_good_right )
                chi2_right = chisquare([bin_right.bad.sum(), good_right], f_exp=[expected_bad_right, expected_good_right])[0]
                chi2_total = chi2_left + chi2_right
                # print("line 477 debug chi2_cut, chi2 is ",chi2_total, ", cut point is ",cut_point )
                
                if chi2_total > chi2_best:
                    chi2_best = chi2_total
                    best_cut_right = cut_point
            
            ## update best score
            score_best = chi2_best

        if method == "entropy":
            ent_best = -1
            overall_bad_rate = df_bin_temp.bad.sum() / df_bin_temp.total.sum() 
            overall_good_rate = 1 - overall_bad_rate
            # entropy_total true for all cuts
            total_sample = df_bin_temp.total.sum() 
            entropy_total = 0 - overall_bad_rate * (np.log(overall_bad_rate)) - overall_good_rate * (np.log(overall_good_rate))

            for cut_point in range( df_bin_temp.index.min()+1 , df_bin_temp.index.max() ): 
                bin_left = df_bin_temp.loc[:cut_point-1, :]
                bin_right = df_bin_temp.loc[cut_point: ,:]

                bad_rate_left = bin_left.bad.sum() / bin_left.total.sum()
                good_rate_left = 1 - bad_rate_left
                total_rate_left = bin_left.total.sum() / total_sample

                bad_rate_right = bin_right.bad.sum() / bin_right.total.sum()
                good_rate_right = 1 - bad_rate_right
                total_rate_right = bin_right.total.sum() / total_sample
                
                entropy_conditinal = 0

                entropy_temp_left = 0
                entropy_temp_left -= good_rate_left * np.log( good_rate_left )
                entropy_temp_left -= bad_rate_left * np.log( bad_rate_left )
                entropy_conditinal = entropy_conditinal + total_rate_left * entropy_temp_left

                entropy_temp_right = 0
                entropy_temp_right -= good_rate_right * np.log( good_rate_right )
                entropy_temp_right -= bad_rate_right * np.log( bad_rate_right )
                entropy_conditinal = entropy_conditinal + total_rate_right * entropy_temp_right

                entropy_cut = 1 - (entropy_conditinal / entropy_total)  
                # print("line 519 debug entropy ---- entropy_cut is ", entropy_cut, ", cut point is ",cut_point, " ent cond and ent total is: ", entropy_conditinal, entropy_total )

                if entropy_cut > ent_best:
                    ent_best = entropy_cut
                    best_cut_right = cut_point
            
            ## update best score
            score_best = ent_best

        return best_cut_right, score_best

    def cut_and_evaluate(self, df_bin_interval, bin_num_temp, **kwargs):
         ## df_bin_temp is df_bin_interval after adding columns in self.top_down_cut()
        method = kwargs.get("method", "iv")
        force_cut = kwargs.get("force_cut", False)
        best_cut_right, score = self.find_cut_point(df_bin_interval, bin_num_temp, **kwargs)
        df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp, "max_score_if_cut"] = score
        
        # decide wether to cut based on score and method
        old_score = df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp, "score"].iloc[0]
        decide_cut_iv_entropy = ( (method=="iv" or method =="entropy") and score > old_score )
        decide_cut_chi2 = (method == "chi" and score > self.chimerge_threshold)  ## and score > old_score ??
        
        # score better than before, will cut into 2 parts
        if decide_cut_iv_entropy or decide_cut_chi2 or force_cut:
            df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp, "score"] = score

            idx_min = df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp].index.min()
            idx_max = df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp].index.max()

            df_bin_interval.loc[idx_min:best_cut_right-1,"bin_temp"] = bin_num_temp*2 + 1
            df_bin_interval.loc[best_cut_right:idx_max,"bin_temp"] = bin_num_temp*2 + 2
            # print("cutting bin ", bin_num_temp," cut at ", best_cut_right, "score ", score )               

        # score no improvement, stop cutting this branch
        else:
            df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp ,"keep_cutting"] = 0  
            # print("stop cutting bin ", bin_num_temp," sum of keep_cutting is ", df_bin_interval.keep_cutting.sum() )
            # score less than before, stop cutting for this temp bin
            
        return df_bin_interval     

    
    def cut_top_down(self, df_bin_interval, **kwargs):
        
        max_bin = kwargs.get("max_bin", self.max_bin)
        min_bin = kwargs.get("min_bin", self.min_bin)
        # pretend that all bins are in the same initial temp bin 0
        df_bin_interval["bin_temp"] = 0
        df_bin_interval["score"] = 0
        df_bin_interval["max_score_if_cut"] = 0
        df_bin_interval["keep_cutting"] = 1
        
        keep_cutting = (df_bin_interval["keep_cutting"].sum()>0)
        
        # start looking at each temp bin and cut
        while keep_cutting:
            
            # find unique temp bins
            lst_current_bins = df_bin_interval.bin_temp.unique().tolist()
            for bin_num_temp in lst_current_bins:
                
                ## only try cutting if this temp bin is labelled keep_cutting == 1
                if df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp, "keep_cutting"].sum()>0:
                    # print("cutting bin ", bin_num_temp)
                    df_bin_interval = self.cut_and_evaluate(df_bin_interval, bin_num_temp, **kwargs)

            has_bin_to_cut = (df_bin_interval["keep_cutting"].sum() > 0)
            below_max_bin = (df_bin_interval['bin_temp'].nunique() < max_bin)
            keep_cutting = ( has_bin_to_cut and below_max_bin )

        while (df_bin_interval['bin_temp'].nunique() < min_bin):
            # if min bin is not satisfied, keep cutting the highest score possible bin
            idx = df_bin_interval.max_score_if_cut.idxmax()
            bin_num_temp = df_bin_interval.bin_temp[idx]
            # print("to satisfy min bin, force cutting temp bin: ",bin_num_temp)
            df_bin_interval = self.cut_and_evaluate(df_bin_interval, bin_num_temp, force_cut = True, **kwargs)

        
        # merge the temp bins, using pandas aggregate methods
        df_group = pd.DataFrame(columns=["bin", "total", "total_rate", "bad", "bad_rate"])
        df_aggregate = df_bin_interval.groupby(by=["bin_temp"])["total","bad"].sum().loc[:,["total", "bad"] ].reset_index(drop=True)
        df_group.total = df_aggregate.total
        df_group.bad = df_aggregate.bad
        total_sample = df_group.total.sum()
        df_group.total_rate = df_group.total / total_sample
        df_group.bad_rate = df_group.bad / df_group.total

        # find the right intervals for each temp bin
        ar_left = df_bin_interval.groupby(by=["bin_temp"])["bin_low"].min().tolist()
        ar_right = df_bin_interval.groupby(by=["bin_temp"])["bin_up"].max().tolist()
        for idx, row in df_group.iterrows():
            df_group.bin[idx] = pd.Interval(left = ar_left[idx], right = ar_right[idx], closed = "right")

        # sort by bin interval, min to max
        df_group = df_group.sort_values(by=['bin']).reset_index(drop=True)

        return df_group  ## debug only, actual is df_group

    def set_significant_figures(self, sr_feature, unique_range):  # eg (1000,5000)

        if (len(sr_feature.unique()) < unique_range[1]):
            return sr_feature

        decimal_place = 6  # start trying with round(sr_feature, 6) 
        sr_result = sr_feature.copy()

        while (len(sr_result.unique()) > unique_range[1]):
            decimal_place -= 1
            sr_result = round(sr_feature, decimal_place)

        if (len(sr_result.unique()) < unique_range[0]):
            decimal_place += 1
            sr_result = round(sr_feature, decimal_place)

        print(sr_feature.name, " rounded to decimal point: ", decimal_place, "   unique values counts = ",len(sr_result.unique()))
        
        return sr_result
    
    def find_turn_count(self, sr):
        ## function to find the longest monotonically decreasing / increasing bad rates in a list
        ## called by self.force_monotone()
        turn_count = 0
        if len(sr) <= 2:
            return 0

        for idx in range(1, len(sr)-1):
            # if it is a turning point
            if (sr[idx-1] > sr[idx] and sr[idx] < sr[idx+1]) or (sr[idx-1] < sr[idx] and sr[idx] > sr[idx+1]):
                turn_count += 1

        return turn_count

    def merge_cont_bin(self, df_bin_interval, idx_left, idx_right):
        ## simply merging 2 bins without calculating chi2. Useful to handle df_bin_interval after chi2 or other method
        ## called by force_monotone()

        df_copy = df_bin_interval.copy()

        df_copy.loc[idx_left,"bad"] += df_copy.loc[idx_right,"bad"]
        df_copy.loc[idx_left, "total"] += df_copy.loc[idx_right,"total"]
        if 'bin_up' in df_copy.columns.to_list():
            df_copy.loc[idx_left, "bin_up"] = df_copy.loc[idx_right, "bin_up"] 
        df_copy.loc[idx_left, "bad_rate"] = df_copy.loc[idx_left,"bad"] / df_copy.loc[idx_left, "total"]
        df_copy.loc[idx_left, "bin"] = pd.Interval(left = df_copy.loc[idx_left, "bin"].left , right = df_copy.loc[idx_right, "bin"].right, closed = 'right')  
        df_copy = df_copy.drop(idx_right).reset_index(drop=True)
        
        return df_copy

    def choose_turning_point_and_neighbor(self, sr_bad_rate):
        ## called by force_monotone()
        ## sr_bad_rate can also be a pd series
        idx_res_left = 0
        idx_res_right = 1
        min_diff = 1 ## bad_rate is 0~1
        idx_turn = 1

        # find the indexes of the pair with closest bad_rate (from turning points)
        for idx in range (1, len(sr_bad_rate)-1):
            is_up_turn = (sr_bad_rate[idx-1] > sr_bad_rate[idx] and sr_bad_rate[idx] < sr_bad_rate[idx+1])
            is_down_turn = (sr_bad_rate[idx-1] < sr_bad_rate[idx] and sr_bad_rate[idx] > sr_bad_rate[idx+1])
            # if it is a turning point (first occurance)
            if is_up_turn or is_down_turn:
                diff_left = abs(sr_bad_rate[idx-1] - sr_bad_rate[idx])
                diff_right = abs(sr_bad_rate[idx] - sr_bad_rate[idx+1])
                # if difference with left is lower
                if diff_left <= diff_right and diff_left < min_diff:
                    idx_res_left, idx_res_right = idx-1, idx
                    min_diff = diff_left
                    idx_turn = idx
                # if difference with right is lower
                elif diff_left > diff_right and diff_right < min_diff:
                    idx_res_left, idx_res_right = idx, idx+1
                    min_diff = diff_right
                    idx_turn = idx

        # return is outside for loop, to find the minimum of bad_rate differences       
        return idx_res_left, idx_res_right, idx_turn
    
    def force_monotone(self, df_bin_interval, **kwargs):
        ## df_bin_interval has columns bin, total, total_rate, bad, bad_rate
        force_mono = kwargs.get('force_mono', 'u_shape')  ## possible values: 'u_shape', 'mono'
        max_bin = kwargs.get('max_bin', 10) ## need to

        if force_mono == 'u_shape':
            allowed_turns = 1  
        else:
            allowed_turns = 0
        
        ## start merging until allowed number of turning points are reached
        while (self.find_turn_count(df_bin_interval.bad_rate) > allowed_turns):
            
            # find the left and right index to merge
            idx_left, idx_right, idx_turn = self.choose_turning_point_and_neighbor(df_bin_interval.bad_rate)
            
            # print(self.find_turn_count(df_bin_interval.bad_rate)," turns, merging ",idx_left," ", idx_right)
            
            # merge the twn bins
            df_bin_interval = self.merge_cont_bin(df_bin_interval, idx_left, idx_right)

        # if u shape and has 1 last turn
        if force_mono == 'u_shape' and self.find_turn_count(df_bin_interval.bad_rate) == 1:
            idx_left, idx_right, idx_turn = self.choose_turning_point_and_neighbor(df_bin_interval.bad_rate)
            
            # if that turn is at beginning or end
            if idx_turn == 1 or idx_turn == len(df_bin_interval.bad_rate)-2:
            
                # we will merge that with neighbors, untill we see mono
                while (self.find_turn_count(df_bin_interval.bad_rate) > 0):
                    
                    # find the left and right index to merge
                    idx_left, idx_right, idx_turn = self.choose_turning_point_and_neighbor(df_bin_interval.bad_rate)
                    
                    # print(self.find_turn_count(df_bin_interval.bad_rate)," turns, merging ",idx_left," ", idx_right)
                    
                    # merge the twn bins
                    df_bin_interval = self.merge_cont_bin(df_bin_interval, idx_left, idx_right)
        
        return df_bin_interval

    def calc_woe(self, df_bin_interval):
        eps = np.finfo(float).eps
        total_count = df_bin_interval.total.sum()
        total_bad = df_bin_interval.bad.sum()
        total_good = total_count - total_bad

        df_bin_interval['good'] = df_bin_interval['total'] - df_bin_interval['bad']
        df_bin_interval['good_density'] = df_bin_interval['good'] / total_good
        df_bin_interval['bad_density'] = df_bin_interval['bad'] / total_bad
        df_bin_interval['woe'] = np.log((df_bin_interval['good_density'].astype('float64') + eps) / (df_bin_interval['bad_density'].astype('float64') + eps))
        df_bin_interval['iv'] = np.log((df_bin_interval['good_density'].astype('float64') + eps) / (df_bin_interval['bad_density'].astype('float64') + eps)) * (df_bin_interval['good_density'].astype('float64') - df_bin_interval['bad_density'].astype('float64'))
        
        return df_bin_interval.drop(columns=['good', 'good_density', 'bad_density'])

    def fit_single_cont(self, x, y, **kwargs):
        method = kwargs.get("method", "iv")
        force_mono = kwargs.get("force_mono", None)
        max_bin = kwargs.get("max_bin", self.max_bin)

        df_na_bin , df_bin_interval = self.init_cont(sr_feature=x, y=y, **kwargs)
        
        if (df_bin_interval.shape[0] < max_bin):
            ## if bin count after init < max_bin, skip the merging / cutting
            df_bin_interval = df_bin_interval.drop(columns = ['bin_low', 'bin_up'])
            print(x.name, "has limited unique values, count < max_bin, skipped merging / cutting")

        elif method == "chi_merge":
            # bottum up merging
            df_all_bin = pd.concat([df_na_bin, df_bin_interval], axis = 0).reset_index(drop = True)
            df_mapped = self.map_bin(x, df_all_bin, inplace = True) ## initial map to both NA and normal bins
            df_chi2 = self.calc_chi2(df_mapped, y, df_all_bin[len(df_na_bin):], **kwargs)
            df_bin_interval, df_chi2 = self.chi2_merge(df_chi2, **kwargs)
            df_bin_interval = df_bin_interval.drop(columns = ['bin_low', 'bin_up'])

        else: 
            # cutting by iv, chi2, or entropy
            df_bin_interval = self.cut_top_down(df_bin_interval, **kwargs) # high level method of top down cutting
        
        # post processing to find total rate
        total_sample = df_bin_interval.total.sum() + df_na_bin.total.sum()
        df_bin_interval.total_rate = df_bin_interval.total / total_sample

        # force monotone of bad rate
        if force_mono: 
            df_bin_interval = self.force_monotone(df_bin_interval, force_mono = force_mono)
        
        ## final merge with NA bins
        df_bin_interval = pd.concat([df_bin_interval, df_na_bin], axis = 0).reset_index(drop = True) ## final merge with NA bins            

        ## calculate woe and iv of each bin
        df_bin_interval = self.calc_woe(df_bin_interval)

        # drop bins where total count == 0
        dict_na = kwargs.get('missing_values', [])
        if type(dict_na) == list:
            df_bin_interval = df_bin_interval[df_bin_interval.total != 0].reset_index(drop = True)

        return df_bin_interval
    

    def fit_single_cat(self, x, y, **kwargs):
        ## expects x as a series object like df.column or df['column']
        method = kwargs.get("method", "chi_merge")
        max_bin = kwargs.get("max_bin", self.max_bin)
        merge_category = kwargs.get("merge_category", True)
        ## initialise the bins
        df_na_bin, df_bin_interval = self.init_cat_bin(x, y, **kwargs)

        if (df_bin_interval.shape[0] < max_bin):
            ## if bin count after init < max_bin, skip the merging / cutting
            print(x.name, "has limited unique values, count < max_bin, skipped merging")

        elif merge_category == False:
            print(x.name, "is categorical, not merging bins according to user's input merge_category")

        else:
            # if method == 'chi_merge':   ## Cat has only chi_merge
            df_chi2 = self.calc_chi2_cat(df_bin_interval)  
            df_bin_interval, df_chi2 = self.chi2_merge(df_chi2, **kwargs)

        if merge_category:
            # merge categorical bins that have the same bad rates
            while ( df_bin_interval.bad_rate.nunique() < len(df_bin_interval) ):
                ## to find 2 bins that are equal in bad rate, merge
                df_same_badrate = df_bin_interval.groupby("bad_rate").filter(lambda x: len(x) > 1)
                idx_left = df_same_badrate.index[0]
                idx_right = df_same_badrate.index[1] 
                df_bin_interval = self.merge_cat_bin(df_bin_interval, idx_left, idx_right)              
               
        df_bin_interval = pd.concat([df_na_bin, df_bin_interval], axis=0)

        #post processing
        total_sample = df_bin_interval.total.sum()
        df_bin_interval.total_rate = df_bin_interval.total / total_sample

        df_bin_interval = df_bin_interval.sort_values(by=['bad_rate']).reset_index(drop=True)
        
        df_bin_interval = self.calc_woe(df_bin_interval)

        # drop bins where total count == 0, when missing_values is list
        dict_na = kwargs.get('missing_values', [])
        if type(dict_na) == list:
            df_bin_interval = df_bin_interval[df_bin_interval.total != 0].reset_index(drop = True)

        return df_bin_interval


    def fit(self, df_feature, df_label, **kwargs):
        # note: df_label is y in sklearn, it is a series like df.dpd30

        lst_cat_feature = kwargs.get("categorical_features", []) ## default assume 0 categorical features
        label = kwargs.get("label", df_label.name)
        self.label = label
        unique_range = kwargs.get("unique_range", None)
        feature_list = kwargs.get("feature_list", df_feature.columns.tolist())
        lst_excluded_ft = kwargs.get("exclude", [])
        dict_na = kwargs.get("missing_values", {})

        self.numerical_features = list(set(df_feature.columns.tolist()) - set(lst_cat_feature) - set([label]))
        self.categorical_features = lst_cat_feature
        if len(lst_cat_feature) == 0:
            print("no categorical_features list is passed, assuming all features are numerical.")

        lst_bin = list()
        lst_ft = list()
        lst_iscat = list() ## A list of boolean values, storing if a feature is categorical
        
        ## fit features that are categorical
        for feature_name in lst_cat_feature:
            if feature_name not in df_feature.columns.to_list():
                print("------- ",feature_name," in param lst_cat_feature NOT found in Dataframe columns, skipped, please check ----------")
                continue
            if (len(df_feature[feature_name])!=len(df_label)):
                print("fit() skipped for this feature. Please make sure length of x and y are the same for x feature name: ", feature_name)
                continue
            if (feature_name == label) or (feature_name in lst_excluded_ft) or (feature_name not in feature_list):
                continue
            
            print("------- fitting: ",feature_name, " -------")
            ## assume all categorical value is str, also force to str in self.transform()
            sr_x = df_feature[feature_name].astype(str)
            df_bin_interval = self.fit_single_cat(sr_x, df_label, **kwargs) 
            lst_bin.append(df_bin_interval)
            lst_ft.append(feature_name)
            lst_iscat.append(True)

        ## fit features that are continuous
        for feature_name in self.numerical_features:
            if (feature_name == label) or (feature_name in lst_excluded_ft) or (feature_name not in feature_list):
                continue

            print("------- fitting: ",feature_name, " -------")
            sr_x = df_feature[feature_name]
            if unique_range is not None:
                sr_x = self.set_significant_figures(sr_x, unique_range)
            df_bin_interval = self.fit_single_cont(sr_x, df_label, **kwargs)
            lst_bin.append(df_bin_interval)
            lst_ft.append(feature_name)
            lst_iscat.append(False)

        ## prepare return model
        ## return model: df_bin_model has three columns ['feature_name', 'is_cat', 'bin_info']
        ## feature_name is a list of feature names
        ## is_cat indicates whether the feature is categorical(True) or numerical(False)        
        ## bin_info is df with columns [ bin, total, total_rate, bad, bad_rate ]
        df_bin_model = pd.DataFrame(columns=['feature_name', 'is_cat', 'bin_info'])
        df_bin_model['feature_name'] = lst_ft
        df_bin_model['is_cat'] = lst_iscat
        df_bin_model['bin_info'] = lst_bin ## Each bin in bin_info: if categorical, is a list of string values of that bin OR if numerical, is a pd.interval
        
        self.model = df_bin_model
        self._fit = True

        # make a copy of fit() result
        self.model_backup = self.model.copy()

        lst_df = []
        for index, row in self.model.iterrows():
            df_tmp = row['bin_info']
            df_tmp['var'] = row['feature_name']
            df_tmp['is_cat'] = row['is_cat']
            lst_df.append(df_tmp)

        self.woe_encoder = pd.concat(lst_df)

        return self

    def transform(self, df_feature, **kwargs):
        inplace = kwargs.get("inplace", True)
        bin_only = kwargs.get("bin_only", True) ## need to change lower level self.map_bin()
        lst_feature_names = kwargs.get("feature_list", [])  ## default transform all, unless specify the columns
        lst_exclude = kwargs.get("exclude", [])
        dict_na = kwargs.get("missing_values", None)

        if self._fit is False:
            raise ValueError("No model exists, please call self.fit(df_feature, df_label) to fit the model first")

        if dict_na is None:
            print("----- No missing_values list is passed in. -----")

        lst_trans = list()
        for idx, row in self.model.iterrows():
            
            name = row['feature_name']
            if name in lst_exclude:
                continue

            # will transform this feature if user never input feature_list, or the name is in the feature_list
            if (len(lst_feature_names)==0) or (name in lst_feature_names):
                print("----- transforming: ", name," -----")
                
                if dict_na:
                    lst_missing_found = self.missing_values_found.get(name,[])
                    lst_missing_user = dict_na.get(name,[])
                    if set(lst_missing_user) != set(lst_missing_found):
                        print(name," missing values found do not match what is passed.")
                        print("Found but not passed:", set(lst_missing_found)-set(lst_missing_user)," Passed but not found: ", set(lst_missing_user)-set(lst_missing_found))

                if row['is_cat'] == True:
                    # transform single categorical feature
                    df_trans = self.map_bin(df_feature[name].astype(str), row.bin_info, inplace = True, bin_only = bin_only, cat = True)
                    lst_trans.append(df_trans)
                else:
                    df_trans = self.map_bin(df_feature[name], row.bin_info, inplace = True, bin_only = bin_only, cat = False)
                    lst_trans.append(df_trans)

        df = pd.concat(lst_trans, axis = 1)
        df_copy = df_feature.copy()
        df_copy.update(df)
        
        if inplace:
            df_feature.update(df)

        return df_copy

    def evaluate_model_bin_count(self):
        if self._fit is False:
            print("No model yet, please call self.fit() first")
            return
        
        feature_count = self.model.shape[0]

        lst_bin_count = []
        for idx in range(feature_count):
            bin_count = self.model.bin_info[idx].shape[0]
            lst_bin_count.append(bin_count)

        sr_bin_count = pd.Series(lst_bin_count)

        bin_min = sr_bin_count.min()
        bin_max = sr_bin_count.max()
        bin_mean = sr_bin_count.mean()
        print("min, max, mean of bin count is : ", bin_min, " ", bin_max," ", bin_mean)

        self.model["bin_count"] = sr_bin_count

        return self.model

    def set_rules(self, dict_rules, data):  ## user has to pass in data (df), in order to re-calculate bad, total, woe and iv

        if self._fit is False:
            print("No model yet, please call self.fit() first")
            return

        if dict_rules == "recover":
            self.model = self.model_backup.copy()
            dict_rules = {}

        for key in dict_rules:
            
            if key in self.model.feature_name.to_list():
                
                row_feature = self.model.loc[self.model.feature_name == key]
                feature_is_cat = row_feature.is_cat.iloc[0]
                df_bin_interval_user = pd.DataFrame(columns= ['bin', 'total', 'total_rate', 'bad', 'bad_rate'])
                lst_user_bin = dict_rules.get(key) ## it will be a list of lists for cat / list of integers for continuous
                df_bin_info = row_feature.bin_info.iloc[0]

                if  feature_is_cat:

                    lst_cat_values = list()
                    # find all the categorical values of this feature
                    for index, row_bin in df_bin_info.iterrows():
                        lst_cat_values += row_bin.bin
                    
                    lst_user_values = list()
                    # find the set of value that user passed in
                    for lst_one_bin in lst_user_bin:
                        lst_user_values += lst_one_bin

                    lst_values_not_in_dict = list(set(lst_cat_values)-set(lst_user_values))

                    if len(lst_values_not_in_dict) > 0:
                        # append the values that user did not pass in as the last bin
                        lst_user_bin.append(lst_values_not_in_dict)

                    df_bin_interval_user.bin = lst_user_bin
                
                else:
                    # for continuous expect a list like [0,2,4,6,8,12]
                    # default will not expect user to change NA bins
                    lst_na_bins = df_bin_info.loc[ df_bin_info['bin'].map(type) == list, 'bin'].to_list()
                    
                    sr_user_bin = pd.Series(lst_user_bin)
                    
                    # if user passes in something like [[-999900],[-999901,-999902],0,2,4,6,9]
                    if (sr_user_bin.map(type) == list).sum()>0:
                        # we will overwrite the na bins as he wishes
                        lst_na_bins = sr_user_bin[sr_user_bin.map(type) == list].to_list()
                        lst_user_bin = sr_user_bin[sr_user_bin.map(type) != list].to_list()

                    if -990000 not in lst_user_bin:
                        lst_user_bin = [-990000]+lst_user_bin
                    if np.inf not in lst_user_bin:
                        lst_user_bin.append(np.inf)

                    lst_user_bin.sort()

                    lst_bin_low = lst_user_bin[:-1]
                    lst_bin_up = lst_user_bin[1:]

                    lst_bin_interval = list()

                    for i in range(0, len(lst_bin_low)):
                        bin_interval = pd.Interval(left = lst_bin_low[i], right = lst_bin_up[i], closed = 'right')
                        lst_bin_interval.append(bin_interval)        

                    # merge the NA bins
                    lst_bin_interval = lst_bin_interval + lst_na_bins

                    df_bin_interval_user.bin = lst_bin_interval

                ## re-calculate woe , iv .... if there is df passed in, and the feature is found in data
                if (data is not None) and (key in data.columns.to_list()):
                    sr_feature = data[key]
                    y = data[self.label]
                    df = pd.concat([sr_feature, y], axis=1)

                    for idx, row in df_bin_interval_user.iterrows():

                        if type(row.bin) == list:
                            df_bin_interval_user.loc[idx,'total'] = df[sr_feature.name].isin(df_bin_interval_user.loc[idx, 'bin']).sum()
                            df_bin_interval_user.loc[idx, 'bad'] = len(df.loc[(df[sr_feature.name].isin(row.bin)) & (df[y.name] == 1)]) 
                        else:
                            df_bin_interval_user.loc[idx,'total'] = len(sr_feature[(sr_feature > row.bin.left) & (sr_feature <= row.bin.right)])
                            df_bin_interval_user.loc[idx, 'bad'] = len(y[((sr_feature > row.bin.left) & (sr_feature <= row.bin.right)) & y==1])

                        df_bin_interval_user.loc[idx,'total_rate'] = df_bin_interval_user.loc[idx,'total'] / len(sr_feature)
                        if df_bin_interval_user.loc[idx,'total'] != 0:
                            df_bin_interval_user.loc[idx, 'bad_rate'] = df_bin_interval_user.loc[idx, 'bad'] / df_bin_interval_user.loc[idx,'total']

                    df_bin_interval_user = self.calc_woe(df_bin_interval_user)

                # update the bin_info in model
                self.model.bin_info[row_feature.index[0]] = df_bin_interval_user.copy()
                print("updated bins for ",key," , the df is now like:")
                print(self.model.bin_info[row_feature.index])
                print("user bin df is:")
                print(df_bin_interval_user)
            
            # if key not in model.feature_name
            else:
                print("column name ", key, " not in model.feature_name")

        lst_df = []
        for index, row in self.model.iterrows():
            df_tmp = row['bin_info']
            df_tmp['var'] = row['feature_name']
            df_tmp['is_cat'] = row['is_cat']
            lst_df.append(df_tmp)

        self.woe_encoder = pd.concat(lst_df)

        return self

    def drop_empty_missing_bin(self):
        if self._fit is False:
            print("No model yet, please call self.fit() first")
            return
        
        lst_df = []
        
        for idx, row in self.model.iterrows():
            row.bin_info = row.bin_info.loc[row.bin_info.total > 0].reset_index(drop = True)
            df_tmp = row['bin_info']
            df_tmp['var'] = row['feature_name']
            df_tmp['is_cat'] = row['is_cat']
            lst_df.append(df_tmp)

        self.woe_encoder = pd.concat(lst_df)

# Params

fit()

In [None]:
## possible params in fit()

#   param name               default         other possible values                 explanation

    # method                 "iv"          "chi", "chi_merge", "entropy"          the method of top-down cutting, or bottom up merging
    # init_method            "quantile"          "step"                           initialisation method
    # min_sample             0.01           can be int >1 or float 0<x<1          minimum sample ration of a bin, when <1, minimum sample count in a bin, when >1
    # min_bin                2               int > 1                              min number of bins
    # max_bin                10              int > 1                              max number of bins
    # missing_values         {}               dict or list                        user's NA list / dict, dict can be for each feature
    # force_mono             None        "u_shape" , "mono" (or any other str)    None is no forcing monotone, "u_shape" is allowing max 1 turn in bad rates, any other str (eg. "mono") is for strictly monotonous
    # unique_range           None           tuple of (int, int)                   None means no change in 精度. the range of unique values allowed for a numerical feature, eg 0.9876543212345 may be turned into 0.9877
    # merge_category         True                   False                         whether a categorical feature will have each value as 1 bin (False will lead to merging of small bins, and chi_merge)
    # multi_missing          False          True or None                          True will make each unique missing value in 1 bin. False will make all in 1 bin. None will allow na bins in merging / cutting, not recommended
    # init_merge_small_bin   True                False                            Default will merge small bins < bin_size, EVEN when the number of unique values in feature is small

    # same as other codes:
        # exclude,  
        # feature_list, 
        # categorical_features

transform()

In [None]:
## kwargs:
    # inplace, default: True, will overwrite the df in parameters
                        # if False, the df in params will not be chaneged, only the returned df is updated
                        
    # bin_only, default: True , will overwrite with intervals like(1.5,5] 
                        # if False, will overwrite with woe value of that bin
                        # if None, will overwrite with bin number of that bin

    # feature_list: list of feature names that the user wants to transform, default [], all transform
    
    # exclude: list of feature names that users does not want to transform, default [], no exclude

    # missing_values: dictionary of "featurename" : [ list of missing values ]. default: {}

set_rules()

In [None]:
## params, only 2 and must have 2
    ## dict_rules : python dictionary like the example above, user can input 1 or many feature names
        # if dict_rules = "reset", will overwrite self.model with self.model_backup
            # in this case , we can use data = None

    ## data: pandas df, logically, it should be the same train data as in fit( )

# Pseudo codes

## bottom-up chi_merge (categorical)

In [None]:
def init_cat_bin(sr_feature, sr_y):
    ## find list of missing values = 
    # ((values passed by user) UNION (default values: "nan", None)) intersect (unique categorical values from this feature)
    df_na_bin = # df with bin column = list of missing values
    df_bin_interval = # df with bin column = the rest values
    return df_na_bin, df_bin_interval

## init with 2 df, 1 for missing value bins, 1 for normal value bins
df_na_bin , df_bin_interval = self.init_cat_bin(sr_feature, sr_y) # ---> function expanded:

init_cat_bin( sr_feature, sr_y)
if (len(df_bin_interval) < max_bin):
    # no enough unique values, skip merging

else: # chi2 merge
    ## initial chi2 calculation
    df_chi2 = self.calc_chi2_cat(df_bin_interval)

    def chi2_merge(df_chi2):
        while len(df_chi2) > min_bin:
            idx_left, idx_right, min_chi2 = # find the min_chi2 pair of bins, in df_chi2, and their indexes 
            if min_chi2 > chimerge_threshold:
                break
            else  ## merge the 2 bins
                df_chi2 = self.merge_pair(idx_left, idx_right)

        while max_bin < len(df_chi2):
            ## if bin count is > max_bin, merge the 2 bins with minimum chi2

        return df_chi2.drop( columns of intermediate workings)

    df_bin_interval = self.chi2_merge(df_chi2)
    
#sort the df by bad rate, so bad_rate will be monotonous
df_bin_interval = df_bin_interval.sort_values(by = ["bad_rate"])

# concat with the missing value bins
df_bin_interval = pd.concat([df_na_bin, df_bin_interval], axis=0)

df_bin_interval = # post processing to find woe, iv, total_rate and drop empty bins

## top-down cutting (numercial)

In [None]:
def init_cont(sr_feature, sr_y):
    ## find list of missing values = 
    # ((values passed by user) UNION (default values: np.nan, None) UNION (unique values < -990000 in feature)) intersect (unique values in this feature)
    df_na_bin = # df with bin column = list of missing values
    df_bin_interval = # df with bin column = the rest values, by equal distance / frequency
    return df_na_bin, df_bin_interval

## init with 2 df, 1 for missing value bins, 1 for normal value bins
df_na_bin , df_bin_interval = self.init_cont(sr_feature, sr_y) # ---> function expanded:

init_cat_bin( sr_feature, sr_y)
if (min_bin < len(df_bin_interval) < max_bin):
    # no enough unique values, skip cutting

else: # start cutting
    df_bin_interval = # add columns ["bin_temp", "score", "max_score_if_cut", "keep_cutting"] for cutting
    keep_cutting = df_bin_interval.keep_cutting.sum() ## all 1
    
    while keep_cutting:
        for bin_num_temp in lst_current_bins:
            
            ## only try cutting if this temp bin is labelled keep_cutting == 1
            if bin_num_temp has "keep_cutting" == 1 ("not set to 0 by lower methods"):
                
                def cut_and_evaluate(df_bin_interval, bin_num_temp):
                    # find best cut point and the best score
                    best_cut_right, score = self.find_cut_point(df_bin_interval, bin_num_temp)

                    decide_cut_iv_entropy = ( (method=="iv" or method =="entropy") and score > old_score )
                    decide_cut_chi2 = (method == "chi" and score > self.chimerge_threshold)

                    if decide_cut_iv_entropy or decide_cut_chi2 or force_cut:
                        ## cut the current temp bin at the best cut_point 
                        ## into bin_num_temp*2+1 and bin_num_temp*2+2, eg 3 will be come 7 and 8
                    else:
                        # set the "keep_cutting" of this part of df_bin_interval to 0
                        df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp ,"keep_cutting"] = 0

                    return df_bin_interval

                df_bin_interval = self.cut_and_evaluate(df_bin_interval, bin_num_temp)

        keep_cutting = ( "some keep_cutting is still 1" and "length <= max_bin" )

    while len(df_bin_interval) > max_bin:
        ## force cut the df, while the length > than max_bin
        df_bin_interval = self.cut_and_evaluate(df_bin_interval, bin_num_temp, force_cut = True)
    
    df_bin_interval = # pandas groupby() to find bad, bad_rate, total, total_rate

    df_bin_interval = # post processing to find total rate

    if force_mono:   # force monotone of bad rate
        df_bin_interval = self.force_monotone(df_bin_interval, force_mono = force_mono)
    
    ## final merge with NA bins
    df_bin_interval = pd.concat([df_bin_interval, df_na_bin], axis = 0).reset_index(drop = True)     

    ## calculate woe and iv of each bin
    df_bin_interval = self.calc_woe(df_bin_interval)

    df_bin_interval = # drop bins where total count == 0

# Mid level methods


## init_cont()

In [10]:
helper = VarBinHelper()

In [11]:
## init_method = "step"
df_na, df_int = helper.init_cont(sample_df.tot_rev_debt, sample_df.bad_ind, init_method = "step", n_bins = 5, min_samples = 0.005)

In [12]:
df_na

Unnamed: 0,bin,total,total_rate,bad,bad_rate
0,"[nan, -999999.0]",549,0.0939264,175,0.318761


In [13]:
df_int

Unnamed: 0,bin,bin_low,bin_up,total,total_rate,bad,bad_rate
0,"(-990000, 0.0]",-990000.0,0.0,304,0.0520103,50,0.164474
1,"(0.0, 505.443172]",0.0,505.4432,735,0.125749,152,0.206803
2,"(505.443172, 1010.886344]",505.443172,1010.886,473,0.0809239,102,0.215645
3,"(1010.886344, 1516.329516]",1010.886344,1516.33,346,0.0591959,77,0.222543
4,"(1516.329516, 2021.772688]",1516.329516,2021.773,302,0.0516681,69,0.228477
5,"(2021.772688, 2527.21586]",2021.772688,2527.216,263,0.0449957,50,0.190114
6,"(2527.21586, 3032.659032]",2527.21586,3032.659,237,0.0405475,65,0.274262
7,"(3032.659032, 3538.102204]",3032.659032,3538.102,174,0.029769,26,0.149425
8,"(3538.102204, 4043.545376]",3538.102204,4043.545,187,0.0319932,35,0.187166
9,"(4043.545376, 4548.988548]",4043.545376,4548.989,150,0.025663,26,0.173333


In [14]:
## init_method = "quantile"
df_na, df_int = helper.init_cont(sample_df.tot_rev_debt, sample_df.bad_ind, init_method = "quantile", n_bins = 5, min_samples = 0.005)

In [15]:
df_na

Unnamed: 0,bin,total,total_rate,bad,bad_rate
0,"[nan, -999999.0]",549,0.0939264,175,0.318761


In [16]:
df_int  ## quantile will give fewer bins after init, since there are many small / empty bins need to merge

Unnamed: 0,bin,bin_low,bin_up,total,total_rate,bad,bad_rate
0,"(-990000, 1.0]",-990000.0,1.0,305,0.0520103,50,0.163934
1,"(1.0, 132.0]",1.0,132.0,233,0.0203593,43,0.184549
2,"(132.0, 296.0]",132.0,296.0,231,0.0196749,48,0.207792
3,"(296.0, 393.0]",296.0,393.0,116,0.019846,28,0.241379
4,"(393.0, 482.0]",393.0,482.0,117,0.0200171,22,0.188034
5,"(482.0, 685.0]",482.0,685.0,232,0.0196749,63,0.271552
6,"(685.0, 965.0]",685.0,965.0,231,0.0196749,41,0.177489
7,"(965.0, 1086.0]",965.0,1086.0,117,0.0200171,23,0.196581
8,"(1086.0, 1255.0]",1086.0,1255.0,116,0.019846,24,0.206897
9,"(1255.0, 1600.0]",1255.0,1600.0,232,0.0196749,56,0.241379


## fit_single_cont()

In [17]:
helper = VarBinHelper(label="bad_ind")
helper.fit_single_cont(sample_df.tot_rev_debt, sample_df.bad_ind, method = "chi", init_method = "step")
## but not ideal as the normal bins did not cut well

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv
0,"(-990000.0, 0.0]",304,0.0520103,50,0.164474,0.268693,0.00346
1,"(0.0, 3032.659032]",2356,0.40308,515,0.218591,-0.082721,0.002826
2,"(3032.659032, 8592.533924]",1324,0.226518,226,0.170695,0.224092,0.010628
3,"(8592.533924, 12130.636128]",527,0.0901625,117,0.222011,-0.102635,0.000979
4,"(12130.636128, inf]",785,0.134303,114,0.145223,0.415952,0.020434
5,"[nan, -999999.0]",549,0.0939264,175,0.318761,-0.597149,0.039253


In [18]:
# force some cutting by passing min_bin
helper.fit_single_cont(sample_df.tot_tr, sample_df.bad_ind, min_bin = 4)

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv
0,"(-990000.0, 13.0]",2268,0.388024,598,0.263668,-0.32963,0.046243
1,"(13.0, 16.0]",587,0.100428,110,0.187394,0.110418,0.001185
2,"(16.0, 23.0]",1184,0.202566,187,0.157939,0.317024,0.018475
3,"(23.0, inf]",1347,0.230453,186,0.138085,0.474672,0.044807
4,"[nan, -999999.0, -999902.0, -999901.0]",459,0.0785287,116,0.252723,-0.272478,0.006298


In [19]:
print(helper.min_bin)
helper.fit_single_cont(sample_df.tot_open_tr, sample_df.bad_ind, method = "iv")
# iv will still cut well, in some context

2


Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv
0,"(-990000.0, 6.0]",2998,0.512917,579,0.193129,0.073188,0.002688
1,"(6.0, 7.0]",405,0.06929,72,0.177778,0.174858,0.00201
2,"(7.0, 8.0]",306,0.0523524,55,0.179739,0.161501,0.001301
3,"(8.0, 9.0]",206,0.0352438,37,0.179612,0.162362,0.000885
4,"(9.0, inf]",511,0.0874251,100,0.195695,0.056804,0.000277
5,[nan],1419,0.242772,354,0.249471,-0.255185,0.016998


In [20]:
# or change from default method "iv" to "chi", which by nature gives more cuts
helper.fit_single_cont(sample_df.tot_tr, sample_df.bad_ind, method = "chi")

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv
0,"(-990000.0, 8.0]",1287,0.220188,365,0.283605,-0.429971,0.045819
1,"(8.0, 13.0]",981,0.167836,233,0.237513,-0.190254,0.006416
2,"(13.0, 16.0]",587,0.100428,110,0.187394,0.110418,0.001185
3,"(16.0, inf]",2531,0.43302,373,0.147373,0.39874,0.060877
4,"[nan, -999999.0, -999902.0, -999901.0]",459,0.0785287,116,0.252723,-0.272478,0.006298


In [21]:
# more example
helper = VarBinHelper(label="bad_ind")
helper.fit_single_cont(sample_df.tot_tr, sample_df.bad_ind, method = "chi_merge")

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv
0,"(-990000, 1.0]",150,0.025663,39,0.26,-0.31065,0.002703
1,"(1.0, 2.0]",132,0.0225834,42,0.318182,-0.594479,0.009348
2,"(2.0, 4.0]",292,0.0499572,75,0.256849,-0.294209,0.004698
3,"(4.0, 6.0]",334,0.0571429,88,0.263473,-0.328624,0.006767
4,"(6.0, 7.0]",198,0.0338751,68,0.343434,-0.708592,0.020436
5,"(7.0, 24.0]",3056,0.52284,594,0.194372,0.065231,0.002182
6,"(24.0, 27.0]",332,0.0568007,42,0.126506,0.575593,0.015716
7,"(27.0, 32.0]",413,0.0706587,66,0.159806,0.303051,0.005915
8,"(32.0, 40.0]",318,0.0544055,40,0.125786,0.582123,0.015364
9,"(40.0, inf]",161,0.0275449,27,0.167702,0.245384,0.001539


In [22]:
sr_copy = sample_df.tot_tr.copy()
print(type(sr_copy[102]))
sr_copy[sr_copy.isna()] = np.nan
print(type(sr_copy[102]))
sr_copy.unique().tolist()[5] 

<class 'numpy.float64'>
<class 'numpy.float64'>


nan

In [23]:
sample_df.loc[102,'tot_tr'] = np.nan
sample_df.loc[102,'tot_tr'] is np.nan

False

## init_cat_bin()

In [24]:
helper = VarBinHelper(label="bad_ind")
df_na, df_bin = helper.init_cat_bin(sample_df.vehicle_make, sample_df.bad_ind, 
                                    missing_values = {"vehicle_make":["SATURN"]}, merge_category = False)

In [25]:
df_na

Unnamed: 0,bin,total,total_rate,bad,bad_rate
0,"[None, SATURN, nan]",176,0.0301112,30,0.170455


In [26]:
df_bin.sort_values( by=["total"])

Unnamed: 0,bin,total,total_rate,bad,bad_rate
7,[VW],68,0.0116339,13,0.191176
23,"[MITSUBISHI, CHRY, HYNDAI, MITISBUSHI, CHEVY G...",70,0.00393499,43,0.614286
10,[PLYMOUTH],77,0.0131737,15,0.194805
13,[KIA],78,0.0133447,16,0.205128
20,"[HYUNDAI, SUBARU]",92,0.0114628,22,0.23913
3,[CHRYSLER],93,0.015911,15,0.16129
15,[MERCURY],95,0.0162532,20,0.210526
19,[OLDS],98,0.0167665,23,0.234694
5,[BUICK],99,0.0169376,17,0.171717
0,"[BUIUCK, RICART, SEDAN, MERC BENZ, M-B, CHEVRE...",117,0.000171086,0,0.0


## fit_single_cat()

In [27]:
helper = VarBinHelper(label="bad_ind")
helper.fit_single_cat(sample_df.vehicle_make, sample_df.bad_ind, missing_values = ["SATURN"])
## note: NAN values in this dataset are: str(np.nan), and None
# then when caling, user can treat any categorical value as NA, by passing missing_values = {str_name:list of NA values}, or a list like init above

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv
0,"[BUIUCK, RICART, SEDAN, MERC BENZ, M-B, CHEVRE...",117,0.0200171,0,0.0,32.361635,0.814611
1,"[LEXUS, AUDI, PORSCHE, VOLKSWAGEN, MITS, ACURA...",146,0.0249786,12,0.0821918,1.056315,0.019864
2,"[MERCEDES, OLDSMOBILE, BMW, HONDA, MITSU, LINC...",295,0.0504705,40,0.135593,0.495765,0.010632
3,"[None, SATURN, nan]",176,0.0301112,30,0.170455,0.225791,0.001433
4,"[CHRYSLER, JEEP, BUICK, GMC, VW, TOYOTA, CHEVY...",3358,0.574508,668,0.198928,0.03639,0.000753
5,[FORD],1090,0.186484,249,0.22844,-0.13948,0.003777
6,"[OLDS, HYUNDAI, SUBARU]",190,0.0325064,45,0.236842,-0.186547,0.001193
7,"[PONTIAC, ISUZU]",262,0.0448246,68,0.259542,-0.308268,0.004646
8,"[MAZDA, SUZUKI, DAEWOO, MERC, PLYM, PLY, SAAB,...",141,0.0241232,42,0.297872,-0.499168,0.006883
9,"[MITSUBISHI, CHRY, HYNDAI, MITISBUSHI, CHEVY G...",70,0.011976,43,0.614286,-1.821982,0.054868


# Bottom-up step by step

## categorical

In [None]:
helper = VarBinHelper(label="bad_ind")
df_na_bin , df_bin_interval = helper.init_cat_bin(sample_df.vehicle_make, sample_df.bad_ind, missing_values = {"vehicle_make":["SATURN"]})
## here the cat value "SATURN" is recognised as missing value, it will be in 1 bin by itself

In [None]:
df_na_bin , df_bin_interval

(         bin total  total_rate bad  bad_rate
 21    [None]    39  0.00667237   9  0.230769
 20     [nan]    49  0.00838323   9  0.183673
 18  [SATURN]    88   0.0150556  12  0.136364,
                                                   bin total  ...  bad   bad_rate
 0   [BUIUCK, RICART, SEDAN, MERC BENZ, M-B, CHEVRE...   117  ...    0          0
 1   [LEXUS, AUDI, PORSCHE, VOLKSWAGEN, MITS, ACURA...   146  ...   12  0.0821918
 2   [MERCEDES, OLDSMOBILE, BMW, HONDA, MITSU, LINC...   295  ...   40   0.135593
 3                                          [CHRYSLER]    93  ...   15    0.16129
 4                                              [JEEP]   196  ...   32   0.163265
 5                                             [BUICK]    99  ...   17   0.171717
 6                                               [GMC]   132  ...   24   0.181818
 7                                                [VW]    68  ...   13   0.191176
 8                                            [TOYOTA]   408  ...   78   0.19

In [None]:
df_chi2 = helper.calc_chi2_cat(df_bin_interval)
df_chi2 ## each value 1 bin, after init

Unnamed: 0,bin,sample_count,bad_count,good_count,bad_rate,bad_count_exp,good_count_exp,chi2,chi2_after_merge_with_left
0,"[BUIUCK, RICART, SEDAN, MERC BENZ, M-B, CHEVRE...",117,0,117,0.0,24.0852,92.9148,30.3285,
1,"[LEXUS, AUDI, PORSCHE, VOLKSWAGEN, MITS, ACURA...",146,12,134,0.0821918,30.055,115.945,13.6578,43.9863
2,"[MERCEDES, OLDSMOBILE, BMW, HONDA, MITSU, LINC...",295,40,255,0.135593,60.7276,234.272,8.9087,22.5665
3,[CHRYSLER],93,15,78,0.16129,19.1446,73.8554,1.12987,10.0386
4,[JEEP],196,32,164,0.163265,40.3479,155.652,2.17486,3.30473
5,[BUICK],99,17,82,0.171717,20.3798,78.6202,0.705796,2.88065
6,[GMC],132,24,108,0.181818,27.173,104.827,0.466569,1.17237
7,[VW],68,13,55,0.191176,13.9982,54.0018,0.0896384,0.556207
8,[TOYOTA],408,78,330,0.191176,83.9894,324.011,0.53783,0.627469
9,"[CHEVY, GEO]",679,131,548,0.192931,139.777,539.223,0.693921,1.23175


In [None]:
df_bin_interval, df_chi2 = helper.chi2_merge(df_chi2)
df_chi2

Unnamed: 0,bin,sample_count,bad_count,good_count,bad_rate,bad_count_exp,good_count_exp,chi2,chi2_after_merge_with_left
0,"[BUIUCK, RICART, SEDAN, MERC BENZ, M-B, CHEVRE...",117,0,117,0.0,24.0852,92.9148,30.3285,
1,"[LEXUS, AUDI, PORSCHE, VOLKSWAGEN, MITS, ACURA...",146,12,134,0.0821918,30.055,115.945,13.6578,43.9863
2,"[MERCEDES, OLDSMOBILE, BMW, HONDA, MITSU, LINC...",295,40,255,0.135593,60.7276,234.272,8.9087,22.5665
3,"[CHRYSLER, JEEP, BUICK, GMC, VW, TOYOTA, CHEVY...",3358,668,2690,0.198928,691.266,2666.73,0.986037,9.89474
4,[FORD],1090,249,841,0.22844,224.383,865.617,3.40066,4.3867
5,"[OLDS, HYUNDAI, SUBARU]",190,45,145,0.236842,39.1127,150.887,1.11587,4.51653
6,"[PONTIAC, ISUZU]",262,68,194,0.259542,53.9344,208.066,4.61905,5.73492
7,"[MAZDA, SUZUKI, DAEWOO, MERC, PLYM, PLY, SAAB,...",141,42,99,0.297872,29.0258,111.974,7.30267,11.9217
8,"[MITSUBISHI, CHRY, HYNDAI, MITISBUSHI, CHEVY G...",70,43,27,0.614286,14.4099,55.5901,71.428,78.7307


In [None]:
df_bin_interval

Unnamed: 0,bin,total,total_rate,bad,bad_rate
0,"[BUIUCK, RICART, SEDAN, MERC BENZ, M-B, CHEVRE...",117,0.0206386,0,0.0
1,"[LEXUS, AUDI, PORSCHE, VOLKSWAGEN, MITS, ACURA...",146,0.0257541,12,0.0821918
2,"[MERCEDES, OLDSMOBILE, BMW, HONDA, MITSU, LINC...",295,0.0520374,40,0.135593
3,"[CHRYSLER, JEEP, BUICK, GMC, VW, TOYOTA, CHEVY...",3358,0.592344,668,0.198928
4,[FORD],1090,0.192274,249,0.22844
5,"[OLDS, HYUNDAI, SUBARU]",190,0.0335156,45,0.236842
6,"[PONTIAC, ISUZU]",262,0.0462163,68,0.259542
7,"[MAZDA, SUZUKI, DAEWOO, MERC, PLYM, PLY, SAAB,...",141,0.0248721,42,0.297872
8,"[MITSUBISHI, CHRY, HYNDAI, MITISBUSHI, CHEVY G...",70,0.0123479,43,0.614286


In [None]:
df_bin_interval = pd.concat([df_na_bin, df_bin_interval], axis=0)

#post processing
total_sample = df_bin_interval.total.sum()
df_bin_interval.total_rate = df_bin_interval.total / total_sample

df_bin_interval = df_bin_interval.sort_values(by=['bad_rate']).reset_index(drop=True)

df_bin_interval = helper.calc_woe(df_bin_interval)

df_bin_interval = df_bin_interval[df_bin_interval.total != 0].reset_index(drop = True)
df_bin_interval

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv
0,"[BUIUCK, RICART, SEDAN, MERC BENZ, M-B, CHEVRE...",117,0.0200171,0,0.0,32.132485,0.643199
1,"[LEXUS, AUDI, PORSCHE, VOLKSWAGEN, MITS, ACURA...",146,0.0249786,12,0.0821918,2.412933,0.050364
2,"[MERCEDES, OLDSMOBILE, BMW, HONDA, MITSU, LINC...",295,0.0504705,40,0.135593,1.852384,0.068137
3,[SATURN],88,0.0150556,12,0.136364,1.845827,0.020211
4,[nan],49,0.00838323,9,0.183673,1.491655,0.007911
5,"[CHRYSLER, JEEP, BUICK, GMC, VW, TOYOTA, CHEVY...",3358,0.574508,668,0.198928,1.393008,0.481893
6,[FORD],1090,0.186484,249,0.22844,1.217139,0.123276
7,[None],39,0.00667237,9,0.230769,1.203973,0.004326
8,"[OLDS, HYUNDAI, SUBARU]",190,0.0325064,45,0.236842,1.170071,0.020018
9,"[PONTIAC, ISUZU]",262,0.0448246,68,0.259542,1.04835,0.022599


## numerical

In [None]:
helper = VarBinHelper(label="bad_ind")
df_na_bin , df_bin_interval = helper.init_cont(sample_df.tot_tr, sample_df.bad_ind, missing_values = NA_list)

In [None]:
df_na_bin

Unnamed: 0,bin,total,total_rate,bad,bad_rate
0,[nan],361,0.0617622,96,0.265928
1,[-999999],9,0.00153978,0,0.0
2,[-999902],59,0.0100941,11,0.186441
3,[-999901],30,0.00513259,9,0.3
4,[-990001],0,0.0,0,


In [None]:
df_bin_interval

Unnamed: 0_level_0,bin,bin_low,bin_up,total,total_rate,bad,bad_rate
bin_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,"(-990000, 1.0]",-990000.0,1.0,150,0.025663,39,0.26
1,"(1.0, 2.0]",1.0,2.0,132,0.0225834,42,0.318182
2,"(2.0, 3.0]",2.0,3.0,150,0.025663,39,0.26
3,"(3.0, 4.0]",3.0,4.0,142,0.0242943,36,0.253521
4,"(4.0, 5.0]",4.0,5.0,170,0.0290847,47,0.276471
5,"(5.0, 6.0]",5.0,6.0,164,0.0280582,41,0.25
6,"(6.0, 7.0]",6.0,7.0,198,0.0338751,68,0.343434
7,"(7.0, 8.0]",7.0,8.0,181,0.0309666,53,0.292818
8,"(8.0, 9.0]",8.0,9.0,170,0.0290847,43,0.252941
9,"(9.0, 10.0]",9.0,10.0,214,0.0366125,53,0.247664


In [None]:
sample_df.tot_tr

0      -999901.0
1      -999901.0
2      -999901.0
3      -999901.0
4      -999901.0
          ...   
5840        21.0
5841         8.0
5842         6.0
5843         9.0
5844        34.0
Name: tot_tr, Length: 5845, dtype: float64

In [None]:
df_all_bin = pd.concat([df_na_bin, df_bin_interval], axis = 0).reset_index(drop = True)
df_mapped = helper.map_bin(sample_df.tot_tr, df_all_bin, inplace = True) ## initial map to both NA and normal bins
df_mapped

Unnamed: 0,tot_tr
0,3.0
1,3.0
2,3.0
3,3.0
4,3.0
...,...
5840,25.0
5841,12.0
5842,10.0
5843,13.0


In [None]:
df_chi2 = helper.calc_chi2(df_mapped, sample_df.bad_ind, df_all_bin[len(df_na_bin):])
df_chi2

Unnamed: 0,bin,bin_low,bin_up,sample_count,bad_count,good_count,bad_rate,bad_count_exp,good_count_exp,chi2,chi2_after_merge_with_left
5,"(-990000, 1.0]",-990000,1.0,150,39,111,0.26,30.7186,119.281,2.80756,
6,"(1.0, 2.0]",1,2.0,132,42,90,0.318182,27.0323,104.968,10.4218,13.2294
7,"(2.0, 3.0]",2,3.0,150,39,111,0.26,30.7186,119.281,2.80756,13.2294
8,"(3.0, 4.0]",3,4.0,142,36,106,0.253521,29.0802,112.92,2.07063,4.87819
9,"(4.0, 5.0]",4,5.0,170,47,123,0.276471,34.8144,135.186,5.36359,7.43422
10,"(5.0, 6.0]",5,6.0,164,41,123,0.25,33.5856,130.414,2.05832,7.42192
11,"(6.0, 7.0]",6,7.0,198,68,130,0.343434,40.5485,157.451,23.3709,25.4292
12,"(7.0, 8.0]",7,8.0,181,53,128,0.292818,37.0671,143.933,8.61235,31.9833
13,"(8.0, 9.0]",8,9.0,170,43,127,0.252941,34.8144,135.186,2.42027,11.0326
14,"(9.0, 10.0]",9,10.0,214,53,161,0.247664,43.8251,170.175,2.41542,4.83569


In [None]:
df_bin_interval, df_chi2 = helper.chi2_merge(df_chi2)
df_bin_interval = df_bin_interval.drop(columns = ['bin_low', 'bin_up'])
df_bin_interval

Unnamed: 0,bin,total,total_rate,bad,bad_rate
0,"(-990000, 1.0]",150,0.02785,39,0.26
1,"(1.0, 2.0]",132,0.024508,42,0.318182
2,"(2.0, 4.0]",292,0.0542146,75,0.256849
3,"(4.0, 6.0]",334,0.0620126,88,0.263473
4,"(6.0, 7.0]",198,0.036762,68,0.343434
5,"(7.0, 25.0]",3183,0.590977,611,0.191957
6,"(25.0, 31.0]",546,0.101374,86,0.157509
7,"(31.0, 32.0]",72,0.013368,5,0.0694444
8,"(32.0, 34.0]",119,0.0220943,15,0.12605
9,"(34.0, inf]",360,0.06684,52,0.144444


In [None]:
df_chi2

Unnamed: 0,bin,bin_low,bin_up,sample_count,bad_count,good_count,bad_rate,bad_count_exp,good_count_exp,chi2,chi2_after_merge_with_left
0,"(-990000, 1.0]",-990000,1.0,150,39,111,0.26,30.7186,119.281,2.80756,
1,"(1.0, 2.0]",1,2.0,132,42,90,0.318182,27.0323,104.968,10.4218,13.2294
2,"(2.0, 4.0]",2,4.0,292,75,217,0.256849,58.606,233.394,5.73746,16.1593
3,"(4.0, 6.0]",4,6.0,334,88,246,0.263473,67.0356,266.964,8.20258,13.94
4,"(6.0, 7.0]",6,7.0,198,68,130,0.343434,40.5485,157.451,23.3709,31.5735
5,"(7.0, 25.0]",7,25.0,3183,611,2572,0.191957,638.846,2544.15,1.5185,24.8894
6,"(25.0, 31.0]",25,31.0,546,86,460,0.157509,109.585,436.415,6.35069,7.86919
7,"(31.0, 32.0]",31,32.0,72,5,67,0.0694444,14.7449,57.2551,8.09901,14.4497
8,"(32.0, 34.0]",32,34.0,119,15,104,0.12605,24.3701,94.6299,4.5305,12.6295
9,"(34.0, inf]",34,inf,360,52,308,0.144444,72.254,287.746,7.10318,11.6337


In [None]:
df_bin_interval = helper.force_monotone(df_bin_interval, force_mono = "one_turn" )
df_bin_interval

Unnamed: 0,bin,total,total_rate,bad,bad_rate
0,"(-990000, 1.0]",150,0.02785,39,0.26
1,"(1.0, 6.0]",758,0.024508,205,0.270449
2,"(6.0, 7.0]",198,0.036762,68,0.343434
3,"(7.0, 25.0]",3183,0.590977,611,0.191957
4,"(25.0, 31.0]",546,0.101374,86,0.157509
5,"(31.0, inf]",551,0.013368,72,0.130672


In [None]:
# post processing to find total rate
total_sample = df_bin_interval.total.sum() + df_na_bin.total.sum()
df_bin_interval.total_rate = df_bin_interval.total / total_sample

## final merge with NA bins
df_bin_interval = pd.concat([df_bin_interval, df_na_bin], axis = 0).reset_index(drop = True) ## final merge with NA bins  

## calculate woe and iv of each bin
df_bin_interval = helper.calc_woe(df_bin_interval)

df_bin_interval

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv
0,"(-990000, 1.0]",150,0.025663,39,0.26,1.045969,0.012884
1,"(1.0, 6.0]",758,0.129683,205,0.270449,0.992348,0.059082
2,"(6.0, 7.0]",198,0.0338751,68,0.343434,0.648027,0.006874
3,"(7.0, 25.0]",3183,0.544568,611,0.191957,1.437342,0.482229
4,"(25.0, 31.0]",546,0.0934132,86,0.157509,1.676879,0.107297
5,"(31.0, inf]",551,0.0942686,72,0.130672,1.895034,0.131955
6,[nan],361,0.0617622,96,0.265928,1.015382,0.029358
7,[-999999],9,0.00153978,0,0.0,29.567536,0.045527
8,[-999902],59,0.0100941,11,0.186441,1.473306,0.009326
9,[-999901],30,0.00513259,9,0.3,0.847298,0.00174


In [None]:
if type(NA_list) == list:
    df_bin_interval = df_bin_interval[df_bin_interval.total != 0].reset_index(drop = True)
df_bin_interval

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv
0,"(-990000, 1.0]",150,0.025663,39,0.26,1.045969,0.012884
1,"(1.0, 6.0]",758,0.129683,205,0.270449,0.992348,0.059082
2,"(6.0, 7.0]",198,0.0338751,68,0.343434,0.648027,0.006874
3,"(7.0, 25.0]",3183,0.544568,611,0.191957,1.437342,0.482229
4,"(25.0, 31.0]",546,0.0934132,86,0.157509,1.676879,0.107297
5,"(31.0, inf]",551,0.0942686,72,0.130672,1.895034,0.131955
6,[nan],361,0.0617622,96,0.265928,1.015382,0.029358
7,[-999999],9,0.00153978,0,0.0,29.567536,0.045527
8,[-999902],59,0.0100941,11,0.186441,1.473306,0.009326
9,[-999901],30,0.00513259,9,0.3,0.847298,0.00174


In [None]:
df_bin_interval.total_rate.sum()

0.9999999999999999

# Top-down step by step

## numerical

In [None]:
helper = VarBinHelper(label="bad_ind")
df_na_bin , df_bin_interval = helper.init_cont(sample_df.tot_tr, sample_df.bad_ind, missing_values = [-999999,-999901])

In [None]:
df_na_bin

Unnamed: 0,bin,total,total_rate,bad,bad_rate
0,[nan],361,0.0617622,96,0.265928
1,[-999999],9,0.00153978,0,0.0
2,[-999902.0],59,0.0100941,11,0.186441
3,[-999901],30,0.00513259,9,0.3


In [None]:
df_bin_interval

Unnamed: 0_level_0,bin,bin_low,bin_up,total,total_rate,bad,bad_rate
bin_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,"(-990000, 1.0]",-990000.0,1.0,150,0.025663,39,0.26
1,"(1.0, 2.0]",1.0,2.0,132,0.0225834,42,0.318182
2,"(2.0, 3.0]",2.0,3.0,150,0.025663,39,0.26
3,"(3.0, 4.0]",3.0,4.0,142,0.0242943,36,0.253521
4,"(4.0, 5.0]",4.0,5.0,170,0.0290847,47,0.276471
5,"(5.0, 6.0]",5.0,6.0,164,0.0280582,41,0.25
6,"(6.0, 7.0]",6.0,7.0,198,0.0338751,68,0.343434
7,"(7.0, 8.0]",7.0,8.0,181,0.0309666,53,0.292818
8,"(8.0, 9.0]",8.0,9.0,170,0.0290847,43,0.252941
9,"(9.0, 10.0]",9.0,10.0,214,0.0366125,53,0.247664


### cut_top_down()

In [None]:
# inside cut_top_down（ ）

max_bin = 10
min_bin = 4

df_bin_interval["bin_temp"] = 0
df_bin_interval["score"] = 0
df_bin_interval["max_score_if_cut"] = 0
df_bin_interval["keep_cutting"] = 1

keep_cutting = (df_bin_interval["keep_cutting"].sum()>0)

keep_cutting

True

In [None]:
df_bin_interval
## bin_temp column is the current temp bin number, for later cutting

Unnamed: 0_level_0,bin,bin_low,bin_up,total,total_rate,bad,bad_rate,bin_temp,score,max_score_if_cut,keep_cutting
bin_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,"(-990000, 1.0]",-990000.0,1.0,150,0.025663,39,0.26,0,0,0,1
1,"(1.0, 2.0]",1.0,2.0,132,0.0225834,42,0.318182,0,0,0,1
2,"(2.0, 3.0]",2.0,3.0,150,0.025663,39,0.26,0,0,0,1
3,"(3.0, 4.0]",3.0,4.0,142,0.0242943,36,0.253521,0,0,0,1
4,"(4.0, 5.0]",4.0,5.0,170,0.0290847,47,0.276471,0,0,0,1
5,"(5.0, 6.0]",5.0,6.0,164,0.0280582,41,0.25,0,0,0,1
6,"(6.0, 7.0]",6.0,7.0,198,0.0338751,68,0.343434,0,0,0,1
7,"(7.0, 8.0]",7.0,8.0,181,0.0309666,53,0.292818,0,0,0,1
8,"(8.0, 9.0]",8.0,9.0,170,0.0290847,43,0.252941,0,0,0,1
9,"(9.0, 10.0]",9.0,10.0,214,0.0366125,53,0.247664,0,0,0,1


In [None]:
## while keep_cutting:  ---> commented to show 1 iteration
    
# find unique temp bins
lst_current_bins = df_bin_interval.bin_temp.unique().tolist()

for bin_num_temp in lst_current_bins:
    
    ## only try cutting if this temp bin is labelled keep_cutting == 1
    if df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp, "keep_cutting"].sum()>0:
        
        # print("cutting bin ", bin_num_temp)
        df_bin_interval = helper.cut_and_evaluate(df_bin_interval, bin_num_temp)

has_bin_to_cut = (df_bin_interval["keep_cutting"].sum() > 0)
below_max_bin = (df_bin_interval['bin_temp'].nunique() < max_bin)
keep_cutting = ( has_bin_to_cut and below_max_bin )

df_bin_interval

Unnamed: 0_level_0,bin,bin_low,bin_up,total,total_rate,bad,bad_rate,bin_temp,score,max_score_if_cut,keep_cutting
bin_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,"(-990000, 1.0]",-990000.0,1.0,150,0.025663,39,0.26,1,0.153374,0.153374,1
1,"(1.0, 2.0]",1.0,2.0,132,0.0225834,42,0.318182,1,0.153374,0.153374,1
2,"(2.0, 3.0]",2.0,3.0,150,0.025663,39,0.26,1,0.153374,0.153374,1
3,"(3.0, 4.0]",3.0,4.0,142,0.0242943,36,0.253521,1,0.153374,0.153374,1
4,"(4.0, 5.0]",4.0,5.0,170,0.0290847,47,0.276471,1,0.153374,0.153374,1
5,"(5.0, 6.0]",5.0,6.0,164,0.0280582,41,0.25,1,0.153374,0.153374,1
6,"(6.0, 7.0]",6.0,7.0,198,0.0338751,68,0.343434,1,0.153374,0.153374,1
7,"(7.0, 8.0]",7.0,8.0,181,0.0309666,53,0.292818,1,0.153374,0.153374,1
8,"(8.0, 9.0]",8.0,9.0,170,0.0290847,43,0.252941,1,0.153374,0.153374,1
9,"(9.0, 10.0]",9.0,10.0,214,0.0366125,53,0.247664,1,0.153374,0.153374,1


In [None]:

while keep_cutting:  
    
    # find unique temp bins
    lst_current_bins = df_bin_interval.bin_temp.unique().tolist()

    for bin_num_temp in lst_current_bins:
        
        ## only try cutting if this temp bin is labelled keep_cutting == 1
        if df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp, "keep_cutting"].sum()>0:
            
            # print("cutting bin ", bin_num_temp)
            df_bin_interval = helper.cut_and_evaluate(df_bin_interval, bin_num_temp)

    has_bin_to_cut = (df_bin_interval["keep_cutting"].sum() > 0)
    below_max_bin = (df_bin_interval['bin_temp'].nunique() < max_bin)
    keep_cutting = ( has_bin_to_cut and below_max_bin )

df_bin_interval

Unnamed: 0_level_0,bin,bin_low,bin_up,total,total_rate,bad,bad_rate,bin_temp,score,max_score_if_cut,keep_cutting
bin_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,"(-990000, 1.0]",-990000.0,1.0,150,0.025663,39,0.26,1,0.153374,0.144015,0
1,"(1.0, 2.0]",1.0,2.0,132,0.0225834,42,0.318182,1,0.153374,0.144015,0
2,"(2.0, 3.0]",2.0,3.0,150,0.025663,39,0.26,1,0.153374,0.144015,0
3,"(3.0, 4.0]",3.0,4.0,142,0.0242943,36,0.253521,1,0.153374,0.144015,0
4,"(4.0, 5.0]",4.0,5.0,170,0.0290847,47,0.276471,1,0.153374,0.144015,0
5,"(5.0, 6.0]",5.0,6.0,164,0.0280582,41,0.25,1,0.153374,0.144015,0
6,"(6.0, 7.0]",6.0,7.0,198,0.0338751,68,0.343434,1,0.153374,0.144015,0
7,"(7.0, 8.0]",7.0,8.0,181,0.0309666,53,0.292818,1,0.153374,0.144015,0
8,"(8.0, 9.0]",8.0,9.0,170,0.0290847,43,0.252941,1,0.153374,0.144015,0
9,"(9.0, 10.0]",9.0,10.0,214,0.0366125,53,0.247664,1,0.153374,0.144015,0


In [None]:
while (df_bin_interval['bin_temp'].nunique() < min_bin):
    # if min bin is not satisfied, keep cutting the highest score possible bin
    
    idx = df_bin_interval.max_score_if_cut.idxmax()
    bin_num_temp = df_bin_interval.bin_temp[idx]
    # print("to satisfy min bin, force cutting temp bin: ",bin_num_temp)
    df_bin_interval = helper.cut_and_evaluate(df_bin_interval, bin_num_temp, force_cut = True)

In [None]:
df_bin_interval

Unnamed: 0_level_0,bin,bin_low,bin_up,total,total_rate,bad,bad_rate,bin_temp,score,max_score_if_cut,keep_cutting
bin_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,"(-990000, 1.0]",-990000.0,1.0,150,0.025663,39,0.26,7,0.113105,0.113105,0
1,"(1.0, 2.0]",1.0,2.0,132,0.0225834,42,0.318182,7,0.113105,0.113105,0
2,"(2.0, 3.0]",2.0,3.0,150,0.025663,39,0.26,7,0.113105,0.113105,0
3,"(3.0, 4.0]",3.0,4.0,142,0.0242943,36,0.253521,7,0.113105,0.113105,0
4,"(4.0, 5.0]",4.0,5.0,170,0.0290847,47,0.276471,7,0.113105,0.113105,0
5,"(5.0, 6.0]",5.0,6.0,164,0.0280582,41,0.25,7,0.113105,0.113105,0
6,"(6.0, 7.0]",6.0,7.0,198,0.0338751,68,0.343434,7,0.113105,0.113105,0
7,"(7.0, 8.0]",7.0,8.0,181,0.0309666,53,0.292818,7,0.113105,0.113105,0
8,"(8.0, 9.0]",8.0,9.0,170,0.0290847,43,0.252941,7,0.113105,0.113105,0
9,"(9.0, 10.0]",9.0,10.0,214,0.0366125,53,0.247664,7,0.113105,0.113105,0


In [None]:
df_group = pd.DataFrame(columns=["bin", "total", "total_rate", "bad", "bad_rate"])
df_aggregate = df_bin_interval.groupby(by=["bin_temp"])["total","bad"].sum().loc[:,["total", "bad"] ].reset_index(drop=True)
df_aggregate

Unnamed: 0,total,bad
0,1347,186
1,1184,187
2,2268,598
3,587,110


In [None]:
df_group.total = df_aggregate.total
df_group.bad = df_aggregate.bad
total_sample = df_group.total.sum()
df_group.total_rate = df_group.total / total_sample
df_group.bad_rate = df_group.bad / df_group.total
df_group

Unnamed: 0,bin,total,total_rate,bad,bad_rate
0,,1347,0.250093,186,0.138085
1,,1184,0.219829,187,0.157939
2,,2268,0.421092,598,0.263668
3,,587,0.108986,110,0.187394


In [None]:
# find the right intervals for each temp bin
ar_left = df_bin_interval.groupby(by=["bin_temp"])["bin_low"].min().tolist()
ar_right = df_bin_interval.groupby(by=["bin_temp"])["bin_up"].max().tolist()
for idx, row in df_group.iterrows():
    df_group.bin[idx] = pd.Interval(left = ar_left[idx], right = ar_right[idx], closed = "right")
df_group

Unnamed: 0,bin,total,total_rate,bad,bad_rate
0,"(23.0, inf]",1347,0.250093,186,0.138085
1,"(16.0, 23.0]",1184,0.219829,187,0.157939
2,"(-990000.0, 13.0]",2268,0.421092,598,0.263668
3,"(13.0, 16.0]",587,0.108986,110,0.187394


In [None]:
ar_left, ar_right

([23.0, 16.0, -990000.0, 13.0], [inf, 23.0, 13.0, 16.0])

In [None]:
df_group = df_group.sort_values(by=['bin']).reset_index(drop=True)
df_group

Unnamed: 0,bin,total,total_rate,bad,bad_rate
0,"(-990000.0, 13.0]",2268,0.421092,598,0.263668
1,"(13.0, 16.0]",587,0.108986,110,0.187394
2,"(16.0, 23.0]",1184,0.219829,187,0.157939
3,"(23.0, inf]",1347,0.250093,186,0.138085


### cut_and_evaluate

In [None]:
helper = VarBinHelper(label="bad_ind")
df_na_bin , df_bin_interval = helper.init_cont(sample_df.tot_tr, sample_df.bad_ind, missing_values = [-999999,-999901])
max_bin = 10
min_bin = 4

df_bin_interval["bin_temp"] = 0
df_bin_interval["score"] = 0
df_bin_interval["max_score_if_cut"] = 0
df_bin_interval["keep_cutting"] = 1

keep_cutting = (df_bin_interval["keep_cutting"].sum()>0)

In [None]:
# def cut_and_evaluate(self, df_bin_interval, bin_num_temp, **kwargs):
## df_bin_temp is df_bin_interval after adding columns in self.top_down_cut()

method = "iv"
force_cut = False
bin_num_temp = 0
best_cut_right, score = helper.find_cut_point(df_bin_interval, bin_num_temp)

best_cut_right, score 

(23, 0.15337399638518912)

In [None]:
df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp, "max_score_if_cut"] = score
df_bin_interval

Unnamed: 0_level_0,bin,bin_low,bin_up,total,total_rate,bad,bad_rate,bin_temp,score,max_score_if_cut,keep_cutting
bin_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,"(-990000, 1.0]",-990000.0,1.0,150,0.025663,39,0.26,0,0,0.153374,1
1,"(1.0, 2.0]",1.0,2.0,132,0.0225834,42,0.318182,0,0,0.153374,1
2,"(2.0, 3.0]",2.0,3.0,150,0.025663,39,0.26,0,0,0.153374,1
3,"(3.0, 4.0]",3.0,4.0,142,0.0242943,36,0.253521,0,0,0.153374,1
4,"(4.0, 5.0]",4.0,5.0,170,0.0290847,47,0.276471,0,0,0.153374,1
5,"(5.0, 6.0]",5.0,6.0,164,0.0280582,41,0.25,0,0,0.153374,1
6,"(6.0, 7.0]",6.0,7.0,198,0.0338751,68,0.343434,0,0,0.153374,1
7,"(7.0, 8.0]",7.0,8.0,181,0.0309666,53,0.292818,0,0,0.153374,1
8,"(8.0, 9.0]",8.0,9.0,170,0.0290847,43,0.252941,0,0,0.153374,1
9,"(9.0, 10.0]",9.0,10.0,214,0.0366125,53,0.247664,0,0,0.153374,1


In [None]:
# decide wether to cut based on score and method
old_score = df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp, "score"].iloc[0]
decide_cut_iv_entropy = ( (method=="iv" or method =="entropy") and score > old_score )
decide_cut_chi2 = (method == "chi" and score > self.chimerge_threshold)

decide_cut_chi2, decide_cut_iv_entropy

(False, True)

In [None]:
# score better than before, will cut into 2 parts
if decide_cut_iv_entropy or decide_cut_chi2 or force_cut:
    df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp, "score"] = score

    idx_min = df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp].index.min()
    idx_max = df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp].index.max()

    df_bin_interval.loc[idx_min:best_cut_right-1,"bin_temp"] = bin_num_temp*2 + 1
    df_bin_interval.loc[best_cut_right:idx_max,"bin_temp"] = bin_num_temp*2 + 2
    # print("cutting bin ", bin_num_temp," cut at ", best_cut_right, "score ", score )               

# score no improvement, stop cutting this branch
else:
    df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp ,"keep_cutting"] = 0
    # print("stop cutting bin ", bin_num_temp," sum of keep_cutting is ", df_bin_interval.keep_cutting.sum() )
    # score less than before, stop cutting for this temp bin

df_bin_interval

Unnamed: 0_level_0,bin,bin_low,bin_up,total,total_rate,bad,bad_rate,bin_temp,score,max_score_if_cut,keep_cutting
bin_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,"(-990000, 1.0]",-990000.0,1.0,150,0.025663,39,0.26,1,0.153374,0.153374,1
1,"(1.0, 2.0]",1.0,2.0,132,0.0225834,42,0.318182,1,0.153374,0.153374,1
2,"(2.0, 3.0]",2.0,3.0,150,0.025663,39,0.26,1,0.153374,0.153374,1
3,"(3.0, 4.0]",3.0,4.0,142,0.0242943,36,0.253521,1,0.153374,0.153374,1
4,"(4.0, 5.0]",4.0,5.0,170,0.0290847,47,0.276471,1,0.153374,0.153374,1
5,"(5.0, 6.0]",5.0,6.0,164,0.0280582,41,0.25,1,0.153374,0.153374,1
6,"(6.0, 7.0]",6.0,7.0,198,0.0338751,68,0.343434,1,0.153374,0.153374,1
7,"(7.0, 8.0]",7.0,8.0,181,0.0309666,53,0.292818,1,0.153374,0.153374,1
8,"(8.0, 9.0]",8.0,9.0,170,0.0290847,43,0.252941,1,0.153374,0.153374,1
9,"(9.0, 10.0]",9.0,10.0,214,0.0366125,53,0.247664,1,0.153374,0.153374,1
