# Load data

In [280]:
import pandas as pd
import numpy as np
from time import time
# https://docs.google.com/spreadsheets/d/1LdpV5SxqbSiDta9eB44iisqxdSUNqH1GiwJx9tY2o1I/edit?usp=sharing
import warnings
warnings.filterwarnings("ignore") ##忽略警告

In [281]:
code = "1LdpV5SxqbSiDta9eB44iisqxdSUNqH1GiwJx9tY2o1I"
read_df = pd.read_csv("https://docs.google.com/spreadsheets/d/" + code + "/gviz/tq?tqx=out:csv")


In [282]:
sample_df = read_df.iloc[:,2:24].copy()
lst_cat = ["vehicle_year", "vehicle_make","bankruptcy_ind", "used_ind"] 

In [283]:
# treat tot_tr as having some -99xxxx values
sample_df.tot_tr[0:30] = -999901
sample_df.tot_tr[31:90] = -999902
sample_df.tot_tr[91:100] = -999999
sample_df.tot_tr[100:180] = np.nan
sample_df.tot_tr[200:280] = None

# tot_rev_debt as having 1 -99xxxx value
sample_df.tot_rev_debt[00:30] = -999999
sample_df.tot_rev_debt[31:80] = np.nan

# insert some nan to vehicle make
sample_df.vehicle_make[31:80] = 'nan'
sample_df.vehicle_make[81:120] = None

# treat used_ind as highly skewed data
sample_df.used_ind[0:5000] = 0
sample_df.used_ind[5000:5500] = 1
sample_df.used_ind[5500:5550] = 2
sample_df.used_ind[5550:5600] = 3
sample_df.used_ind[5600:5650] = 4
sample_df.used_ind[5650:5700] = 5
sample_df.used_ind[5700:5750] = 6
sample_df.used_ind[5750:5800] = 7
sample_df.used_ind[5800:] = 8

# treat tot_derog , and age_oldest_tr and the rest as normal continuous feature

sample_df

Unnamed: 0,bad_ind,vehicle_year,vehicle_make,bankruptcy_ind,tot_derog,tot_tr,age_oldest_tr,tot_open_tr,tot_rev_tr,tot_rev_debt,tot_rev_line,rev_util,fico_score,purch_price,msrp,down_pyt,loan_term,loan_amt,ltv,tot_income,veh_mileage,used_ind
0,1,1998.0,FORD,N,7.0,-999901.0,64.0,2.0,1.0,-999999.0,500.0,101,650.0,17200.00,17350.0,0.00,36,17200.00,99.0,6550.00,24000.0,0
1,0,2000.0,DAEWOO,N,0.0,-999901.0,240.0,11.0,7.0,-999999.0,57241.0,60,649.0,19588.54,19788.0,683.54,60,19588.54,99.0,4666.67,22.0,0
2,1,1998.0,PLYMOUTH,N,7.0,-999901.0,60.0,,,-999999.0,,0,613.0,13595.00,11450.0,0.00,60,10500.00,92.0,2000.00,19600.0,0
3,1,1997.0,FORD,N,3.0,-999901.0,35.0,5.0,4.0,-999999.0,5946.0,68,603.0,12999.00,12100.0,3099.00,60,10800.00,118.0,1500.00,10000.0,0
4,0,2000.0,TOYOTA,N,0.0,-999901.0,104.0,2.0,0.0,-999999.0,1800.0,0,764.0,26328.04,22024.0,0.00,60,26328.04,122.0,4144.00,14.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5840,0,1997.0,PORSCHE,N,0.0,21.0,417.0,4.0,2.0,1859.0,52200.0,4,801.0,0.00,31000.0,0.00,36,31000.00,100.0,5000.00,45000.0,8
5841,0,2000.0,TOYOTA,Y,2.0,8.0,62.0,5.0,3.0,4992.0,5066.0,99,628.0,24970.00,22024.0,0.00,60,24970.00,117.0,2400.00,21.0,8
5842,0,1997.0,CHEVROLET,N,0.0,6.0,30.0,4.0,3.0,972.0,5616.0,17,735.0,20949.00,18950.0,0.00,36,20949.00,113.0,1837.50,25000.0,8
5843,0,1999.0,MERCURY,N,0.0,9.0,67.0,7.0,5.0,13714.0,14061.0,98,737.0,22400.00,28700.0,5300.00,48,17100.00,60.0,28000.00,0.0,8


In [284]:
## define some NA values by user, both works
NA_dict = {"vehicle_year":["1998.0"], "vehicle_make":["FORD","B50"]}
NA_list = [-999901,-999902,-999999,-990001]

# Class

In [352]:
## New OO Helper with cleaned code

import os
import pandas as pd
import numpy as np
from time import time as now
from scipy.stats import chi2, chisquare
import math
from sklearn.base import TransformerMixin


class VarBinHelper(TransformerMixin):

    def __init__(self, **kwargs):
        ## initialise the object with name of label column, min_sample, min_bin_num
        self.min_sample = kwargs.get('min_sample', 0.02)
        self.min_bin = kwargs.get('min_bin', 2)
        self.max_bin = kwargs.get('max_bin', 10)
        self.chimerge_threshold = kwargs.get("chimerge_threshold", chi2.ppf(0.95, 1))
        self.label = kwargs.get('label', None)
        self._fit = False
        self.missing_values_found = {} ## will be a dict
        
        ## to make the class interface same as other versions from the team
        self.categorical_features = None ## updated in fit() 
        self.numerical_features = None ## updated in fit() 
        self.woe_encoder = None  ## model
        self.dict_binlist = None ## model.bin_info
        

    def set_chimerge_threshold(self, p=0.95, df=1):
        self.chimerge_threshold = chi2.ppf(p, df)

    def init_cat_bin(self, sr_feature, y, min_sample=0.01, **kwargs):
        ## put each outcome as 1 bin, rank by bad_rate, merge small bins with the neighbor with closest bad_rate
        ## assume all categorical values are string, including year eg. "2020"
        method = kwargs.get('method', "chi_merge")
        min_bin_size = kwargs.get("min_bin_size", 5)
        multi_missing = kwargs.get("multi_missing", False)
        dict_na = kwargs.get('missing_values', {}) 
        merge_category = kwargs.get("merge_category", True)
        init_merge_small_bin = kwargs.get('init_merge_small_bin', True)

        feature_name = sr_feature.name
        if type(dict_na) == list:
            lst_na = dict_na
        else:
            lst_na = dict_na.get(feature_name, ['nan', None]) 
        
        # decide bin_size (min sample in a bin)
        df = pd.concat([sr_feature, y], axis=1)
        if min_sample > 1:  ## find the size of bin
            bin_size = int(max(min_sample, min_bin_size))
        else:
            bin_size = int(max(min_sample * len(sr_feature), min_bin_size))

        # initialise each value as 1 bin
        lst_unique = sr_feature.unique().tolist()
        df_bin_interval = pd.DataFrame(columns=['bin', 'total', 'total_rate', 'bad', 'bad_rate'], index=list(range(len(lst_unique))))
        df_bin_interval.bin = lst_unique

        # calculate total, total_rate, bad, bad_rate for each bin
        for idx, row in df_bin_interval.iterrows():
            row.bin = [row.bin]
            row.total = df[sr_feature.name].isin(df_bin_interval.loc[idx, 'bin']).sum()
            row.total_rate = row.total / len(sr_feature)
            row.bad = len(df.loc[(df[sr_feature.name].isin(row.bin)) & (df[y.name] == 1)])
            row.bad_rate = row.bad / row.total

        # separates NA values as unique bins
        if multi_missing is not None:
            
            ## determine what NA values exist in this series
            if np.nan in lst_na:
                lst_na.remove(np.nan)
            if 'nan' not in lst_na:
                lst_na.append('nan') ## because sr_feature is passed in as df['feature_name].astype(str), we can only find "nan"
            if None not in lst_na:
                lst_na.append(None)

            lst_na_exist = list(set(lst_na) & set(lst_unique)) ## use set interscetion because lst_na might have values not in lst_unique
            self.missing_values_found[feature_name] = lst_na_exist

            if list(set(lst_na) - set(lst_na_exist)):
                print("NA values ", list(set(lst_na) - set(lst_na_exist)), " not found in ", sr_feature.name)
            
            # put NA bins' index in list, use .loc() to extract, then drop them from df_bin_interval
            lst_na_idx = list()
            for na_value in lst_na_exist:  
                lst_na_idx.append(df_bin_interval.loc[df_bin_interval.bin.apply(lambda x: x == [na_value])].index[0])

            df_na_bin = df_bin_interval.loc[lst_na_idx]
                        
            if multi_missing == False and len(lst_na_exist)>0:
                df_temp = pd.DataFrame(columns=['bin', 'total', 'total_rate', 'bad', 'bad_rate'])
                df_temp.bin = [lst_na_exist]
                df_temp.total[0] = df_na_bin.total.sum()
                df_temp.bad[0] = df_na_bin.bad.sum()
                df_temp.total_rate = df_temp.total / len(sr_feature)
                df_temp.bad_rate = df_temp.bad / df_temp.total
                df_na_bin = df_temp

            df_bin_interval = df_bin_interval.drop(index=lst_na_idx)

        df_bin_interval = df_bin_interval.sort_values(by=['bad_rate']).reset_index(drop=True)

        # merge small bins < bin_size for certian methods
        if init_merge_small_bin == True or merge_category == True: # and merge_category == True
            df_bin_interval = self.merge_small_cat_bins(df_bin_interval, bin_size)

        return df_na_bin, df_bin_interval

    def merge_cat_bin(self, df_bin_interval, idx_left, idx_right):
        bin_left = df_bin_interval.loc[idx_left]
        bin_right = df_bin_interval.loc[idx_right]
        bin_left.bad += bin_right.bad
        bin_left.total += bin_right.total
        bin_left.bad_rate = bin_left.bad / bin_left.total
        bin_left.bin += bin_right.bin
        df_bin_interval = df_bin_interval.drop(idx_right).reset_index(drop=True)
        return df_bin_interval
    
    def merge_small_cat_bins(self, df_bin_interval, bin_size):

        ## choose the best neighbor(left vs right) to merge, based on bad_rate similarity
        while df_bin_interval.total.min() < bin_size:
            idx = df_bin_interval.total.astype(int).idxmin()
            if idx == 0:
                ## left most bin, no choice, merge with right neighbor
                df_bin_interval = self.merge_cat_bin(df_bin_interval, idx, idx + 1)
            elif idx == len(df_bin_interval) - 1:
                ## right most bin, merge with left neighbor
                df_bin_interval = self.merge_cat_bin(df_bin_interval, idx - 1, idx)
            else:
                bad_rate = df_bin_interval.bad_rate[idx]
                bad_rate_right = df_bin_interval.bad_rate[idx + 1]
                bad_rate_left = df_bin_interval.bad_rate[idx - 1]
                diff_left = bad_rate - bad_rate_left
                diff_right = bad_rate_right - bad_rate
                merge_right = diff_right < diff_left  ## True False but used as 1 and 0 in the next line, to decide where to merge
                df_bin_interval = self.merge_cat_bin(df_bin_interval, idx - 1 + merge_right, idx + merge_right)
        
        return df_bin_interval

    def calc_chi2_cat(self, df_bin_interval):
        ## only being called once for each feature, find chi2 the first time.
        ## No kwargs to pass in
        total_count = df_bin_interval.total.sum()
        total_bad = df_bin_interval.bad.sum()
        total_good = total_count - total_bad

        ## initialise the df to return
        cols = ["bin", "sample_count", "bad_count", "good_count", "bad_rate", "bad_count_exp",
                "good_count_exp", "chi2", "chi2_after_merge_with_left"]
        df = pd.DataFrame(columns=cols)
        df.bin = df_bin_interval.bin
        df.sample_count = df_bin_interval.total
        df.bad_count = df_bin_interval.bad
        df.bad_rate = df_bin_interval.bad_rate

        ## find chi2 related stats for each bin(row)
        for index, row in df.iterrows():
            row.good_count = row.sample_count - row.bad_count
            row.bad_count_exp = (row.sample_count) / total_count * total_bad
            row.good_count_exp = (row.sample_count) / total_count * total_good
            row.chi2 = chisquare([row.bad_count, row.good_count], f_exp=[row.bad_count_exp, row.good_count_exp])[0]
            if index > 0:
                row.chi2_after_merge_with_left = row.chi2 + df.chi2[index - 1]

        return df

    def init_cont(self, sr_feature, y, **kwargs):

        ## missing value handling --> default is 1 single bin!
        ## min_sample < 1 means each bin has same proprtion (eg. 0.05) of all samples.
        ## min_bin_size -->  optional, dfaut = 5
        ## prioritise min_sample --> is must have
        ## if dont fulfill, error
        ## >1 means each bin has fixed number of samples
        min_sample = kwargs.get("min_sample", self.min_sample)
        min_bin_size = kwargs.get("min_bin_size", 5) # min sample count in a bin
        multi_missing = kwargs.get("multi_missing", False)
        init_method = kwargs.get("init_method", "quantile")
        dict_na = kwargs.get('missing_values', {})
        init_merge_small_bin = kwargs.get('init_merge_small_bin', True)
        feature_name = sr_feature.name
        # sr_feature[sr_feature.isna()] = np.nan ## set all the NAs to np.nan

        if type(dict_na) == list:
            lst_na = dict_na
        else:
            lst_na = dict_na.get(feature_name, []) 
        # print("----- 186",lst_na)
        ## find the size of bin
        if min_sample > 1:  
            bin_size = int(max(min_sample, min_bin_size))
        else:
            bin_size = int(max(min_sample * len(sr_feature), min_bin_size))

        ## sort the varibale for later binning, not using unique values because we are doing same frequency
        sr_feature_sorted = sr_feature.sort_values().reset_index(drop=True).copy()

        ## if choose separate bin for missing value, add np.nan as a bin, and each value <= -99000 as a bin
        if multi_missing is not None:
                    
            if np.nan not in lst_na:
                lst_na.append(np.nan)
            # print("----- 203",lst_na) # if float("nan") not in lst_na:
            #     lst_na.append(float("nan"))
            array_feature_unique = sr_feature_sorted.unique()
            array_possible_na = array_feature_unique[array_feature_unique <= -990000] # eg 990001 990003
            # print("----- 207",array_possible_na)
            
            if dict_na:
                for na_val in array_possible_na.tolist():
                    if na_val not in lst_na:
                        print(na_val," found in feature:",feature_name,", but not specified in missing_values.")

            lst_na = list(set(lst_na).union(set(array_possible_na.tolist())))

            lst_na_lst = list()
            for na_value in lst_na:
                if na_value == "nan" or na_value is None:
                    na_value = np.nan
                    if np.nan in lst_na:
                        continue
                else:
                    lst_na_lst.append([na_value])

            sr_feature_sorted = sr_feature_sorted.dropna()
            sr_feature_sorted = sr_feature_sorted[sr_feature_sorted > -990000].reset_index(drop = True)

        ## find the target count of bins for normal bins
        target_bin_count = len(sr_feature_sorted) / bin_size

        idx = bin_size - 1  ## initialise the running index to look at first cut point
        lst_bin_interval = list()
        lst_bin_up = list()
        lst_bin_low = [-990000]  ## first lower bound is -inf

        ## if unique value is smaller than target_bin_count, each is 1 bin
        if (sr_feature_sorted.nunique() < target_bin_count):
            for cur_val in sr_feature_sorted.unique().tolist():
                lst_bin_interval.append(pd.Interval(left = lst_bin_low[-1:][0], right = cur_val, closed = 'right'))
                lst_bin_up.append(cur_val)
                lst_bin_low.append(cur_val)

        # initialise with equal frequency
        elif init_method == "quantile":
            ##  Start Binning. Jump every <bin_size> in the sorted X array to record cut points
            while idx < len(sr_feature_sorted):
                cur_val = sr_feature_sorted.loc[idx]
                ##  every bin_low is exclusive, bin_up is inclusive, interval like (low,up]
                ## prevent having intervals like (x,x], which is empty bin
                if cur_val in lst_bin_up:
                    ## change idx to point to next new value
                    try: ## only error is when last unique value count is larger then bin_size
                        idx = sr_feature_sorted[sr_feature_sorted > cur_val].index[0]
                        continue
                    except:
                        pass

                if cur_val not in lst_bin_up:
                    if  not math.isnan(cur_val):
                        lst_bin_interval.append(pd.Interval(left = lst_bin_low[-1], right = cur_val, closed = 'right'))
                        lst_bin_up.append(cur_val)
                        lst_bin_low.append(cur_val)
                    
                ## inspect the next value in sr_feature_sorted after <bin_size>
                idx += bin_size

        # initialise with equal distance
        elif init_method == "step":
            len_sr = len(sr_feature_sorted)
            sr_feature_sorted = sr_feature_sorted[ int(0.05*len_sr) : int(0.95*len_sr) ].reset_index(drop=True)## follow book, ignore < 5% and > 95%
            value_min = sr_feature_sorted[0]
            value_max = sr_feature_sorted[len(sr_feature_sorted)-1]
            dist = (value_max-value_min) / (len(sr_feature_sorted) / (bin_size*0.9)) ## (len(sr_feature_sorted) / bin_size) is number of bins to start with
            cur_val = value_min

            # if dist > 0.01:  # round the cut points for simplicity
            #     print("cut ponints will round to 6 dp.")
            #     # dist = round(dist, 6)
            
            # go through each cut point, add to lists
            while (cur_val < value_max*1.001):
                if dist > 0.01:
                    cur_val = round(cur_val, 6)
                lst_bin_interval.append(pd.Interval(left = lst_bin_low[-1], right = cur_val, closed = 'right'))
                lst_bin_up.append(cur_val)
                lst_bin_low.append(cur_val)
                cur_val += dist

            lst_bin_interval.append(pd.Interval(left = lst_bin_low[-1], right = cur_val, closed = 'right'))
            lst_bin_up.append(value_max)
            lst_bin_low.append(value_max)

        ## assume the highest bin is small, merge with 2nd highest bin, set upper bound as inf
        lst_bin_low = lst_bin_low[:-1]
        lst_bin_up[-1] = np.inf
        lst_bin_interval[-1] = pd.Interval(left = lst_bin_low[-1], right = np.inf, closed = 'right')

        ## create the df of normal bins to return
        df_bin_interval = pd.DataFrame(columns= ['bin', 'bin_low', 'bin_up', 'total', 'total_rate', 'bad', 'bad_rate'])
        df_bin_interval.bin = lst_bin_interval
        df_bin_interval.bin_low = lst_bin_low
        df_bin_interval.bin_up = lst_bin_up
        df_bin_interval.index.name = 'bin_num'
        
        # calculate 'total', 'total_rate', 'bad', 'bad_rate'
        for idx, row in df_bin_interval.iterrows():
            df_bin_interval.loc[idx,'total'] = len(sr_feature[(sr_feature > row.bin_low) & (sr_feature <= row.bin_up)])
            df_bin_interval.loc[idx,'total_rate'] = df_bin_interval.loc[idx,'total'] / len(sr_feature)
            df_bin_interval.loc[idx, 'bad'] = len(y[((sr_feature > row.bin.left) & (sr_feature <= row.bin.right)) & y==1])
            if df_bin_interval.loc[idx,'total'] != 0:
                df_bin_interval.loc[idx, 'bad_rate'] = df_bin_interval.loc[idx, 'bad'] / df_bin_interval.loc[idx,'total']
        
        ## merge small bins, since equal distance will have empty/small bins 
        if init_method == "step" or init_merge_small_bin:
            while (df_bin_interval.total.min()<bin_size):
                idx = df_bin_interval.total.astype(int).idxmin()
                if idx == 0:
                    ## left most bin, no choice, merge with right neighbor
                    df_bin_interval = self.merge_cont_bin(df_bin_interval, idx, idx + 1)
                elif idx == len(df_bin_interval) - 1:
                    ## right most bin, merge with left neighbor
                    df_bin_interval = self.merge_cont_bin(df_bin_interval, idx - 1, idx)
                else:
                    bad_rate = df_bin_interval.bad_rate[idx]
                    bad_rate_right = df_bin_interval.bad_rate[idx + 1]
                    bad_rate_left = df_bin_interval.bad_rate[idx - 1]
                    diff_left = bad_rate - bad_rate_left
                    diff_right = bad_rate_right - bad_rate
                    merge_right = diff_right < diff_left  ## True False but used as 1 and 0 in the next line, to decide where to merge
                    df_bin_interval = self.merge_cont_bin(df_bin_interval, idx - 1 + merge_right, idx + merge_right)
        
        ## create the df of NA bins
        df_na_bin = pd.DataFrame(columns = ['bin', 'total', 'total_rate', 'bad', 'bad_rate'])       
        df_na_bin.bin = lst_na_lst
        lst_na_exist = []

        for idx, row in df_na_bin.iterrows():
            row.total = sr_feature.isin(row.bin).sum()
            row.total_rate = row.total / len(sr_feature)
            row.bad = len(y[sr_feature.isin(row.bin) & y==1])
            if row.total != 0:
                row.bad_rate = row.bad / row.total
                lst_na_exist += row.bin
            elif dict_na:
                print(row.bin[0], ", this missing value does not exist in ",feature_name)

        self.missing_values_found[feature_name] = lst_na_exist # update object attribute, for later checking in transform()

        if multi_missing == False:
            df_temp = pd.DataFrame(columns=['bin', 'total', 'total_rate', 'bad', 'bad_rate'])
            if len(lst_na_exist) == 0:
                lst_na_exist = [np.nan]
            df_temp.bin = [lst_na_exist]
            df_temp.total[0] = df_na_bin.total.sum()
            df_temp.bad[0] = df_na_bin.bad.sum()
            df_temp.total_rate = df_temp.total / len(sr_feature)
            df_temp.bad_rate = df_temp.bad / df_temp.total
            df_na_bin = df_temp

        return df_na_bin, df_bin_interval

    def map_bin(self, sr_feature, df_bin_interval, **kwargs):
        ## maps both categorical and numerical x
        ## sr_feature data should be 1 column of series-like
        inplace = kwargs.get("inplace", False)  ## by default will not overwrite sr_feature values, but add a column "bin"
        cat = kwargs.get('cat', False)
        bin_only = kwargs.get('bin_only', None)

        ## df is to record intermediate, will be returned
        var_name = sr_feature.name
        df = pd.DataFrame(sr_feature, columns=[var_name])  
        df[(var_name+'_bin')] = df[var_name]
        lst_bins = []
        cat_count = 0

        ## Mapping starts, iterates by intevals, for categorical, and NA bins of numerical, row.bin is a list, other numerical row.bin is a pd.Interval
        if cat:
            for idx, row in df_bin_interval.iterrows():
                # df.loc[(df[var_name].isin(row.bin)), (var_name+'_bin')] = idx
                if bin_only is None:
                    value = idx
                elif bin_only == True:
                    value = row.bin
                elif bin_only == False:
                    value = row.woe

                df[(var_name+'_bin')] = df[(var_name+'_bin')].replace(row.bin, value)
        else:
            for idx, row in df_bin_interval.iterrows():
                
                if bin_only is None:
                    value = idx
                elif bin_only == True:
                    value = row.bin
                elif bin_only == False:
                    value = row.woe

                if type(row.bin) == pd.Interval:
                    # df.loc[(df[var_name] > row.bin.left) & (df[var_name] <= row.bin.right), (var_name+'_bin')] = idx
                    df[(var_name+'_bin')] = df[(var_name+'_bin')].mask( ((df[var_name] > float(row.bin.left)) & (df[var_name] <= float(row.bin.right) ) ) , value) 
                    lst_bins.append(row.bin.left)

                else:
                    # df.loc[(df[var_name].isin(row.bin)), (var_name+'_bin')] = idx
                    df[(var_name+'_bin')] = df[(var_name+'_bin')].replace(row.bin, value)
                    cat_count += 1
            
            # lst_bins.append(np.inf)
            # df[(var_name+'_bin')] = pd.cut(df[(var_name+'_bin')], bins = lst_bins, labels=False, right=True)+cat_count

        if inplace: 
            df = df.drop(columns=[var_name])
            df.columns = [var_name]

        return df

    def calc_chi2(self, df_mapped, y, df_bin_interval, **kwargs):
        ## deal with both continuous feature, expect X have 2 columns, just the X var + mapping output
        ## df_bin_interval is the output from initialisation (same frequency or same distance)
        label = kwargs.get("label", self.label)
        var_name = df_mapped.columns[0]
        df_mapped = pd.concat([df_mapped, y], axis=1)
        df_mapped.columns = [var_name, label]
        cols = ["bin","bin_low", "bin_up", "sample_count", "bad_count", "good_count", "bad_rate", "bad_count_exp",
                "good_count_exp", "chi2", "chi2_after_merge_with_left"]

        total_bad = df_mapped[label].sum()  ## find the total bad count and good count
        total_good = len(df_mapped) - total_bad

        ## working df, to be returned
        df = pd.DataFrame(columns=cols, index=df_bin_interval.index.astype(int))
        starting_idx = df_bin_interval.index.astype(int).min()
        df.loc[:, ["bin", 'bin_low', 'bin_up']] = df_bin_interval.loc[:, ["bin", 'bin_low', 'bin_up']]

        for idx, row in df.iterrows():
            row.sample_count = len(df_mapped.loc[(df_mapped[var_name] == idx)])
            row.bad_count = len(df_mapped.loc[(df_mapped[var_name] == idx) & (df_mapped[label] == 1)])
            row.good_count = len(df_mapped.loc[(df_mapped[var_name] == idx) & (df_mapped[label] == 0)])
            row.bad_count_exp = (row.sample_count) / len(df_mapped) * total_bad
            row.good_count_exp = (row.sample_count) / len(df_mapped) * total_good
            row.chi2 = chisquare([row.bad_count, row.good_count], f_exp=[row.bad_count_exp, row.good_count_exp])[0]
            if idx >  starting_idx:
                row.chi2_after_merge_with_left = row.chi2 + df.chi2[idx - 1]
            if row.sample_count != 0:
                row.bad_rate = row.bad_count / row.sample_count
            else:
                row.bad_rate = np.nan

        return df

    def merge_pair(self, df_chi2, idx_left, idx_right):  
        ## merge row with idx_left and idx_right, called by chi2_merge(), both cat and continuous

        df = df_chi2  ## will return this df
        count_toal = df.sample_count.sum()
        bad_total = df.bad_count.sum()
        good_total = df.good_count.sum()

        row = df.loc[idx_left]
        next_row = df.loc[idx_right]

        try:
            row.bin_up = next_row.bin_up # assign upper interval, continuous
            row.bin = pd.Interval(left = row.bin.left, right = row.bin_up, closed = 'right')
        except:
            row.bin += next_row.bin # merge list, cat

        row.sample_count += next_row.sample_count
        row.bad_count += next_row.bad_count
        row.good_count += next_row.good_count
        row.bad_count_exp = row.sample_count / count_toal * bad_total
        row.good_count_exp = row.sample_count / count_toal * good_total
        row.chi2 = chisquare(f_obs=[row.bad_count, row.good_count], f_exp=[row.bad_count_exp, row.good_count_exp])[0]

        if row.sample_count != 0:
            row.bad_rate = row.bad_count / row.sample_count
        else:
            row.bad_rate = np.nan

        if idx_left > df.index.min():
            row.chi2_after_merge_with_left = row.chi2 + df.loc[idx_left - 1, 'chi2']  ## the left neighbor of left bin
        if idx_left + 2 < len(df_chi2):
            ## because the second last row does not have index+2 row, update the chi2 if merge with right bin's right neighbor
            df.loc[idx_left + 2, 'chi2_after_merge_with_left'] = row.chi2 + df.loc[idx_left + 2, 'chi2']

        df.loc[idx_left] = row
        return df.drop([idx_right]).reset_index(drop=True)

    def chi2_merge(self, df_chi2, **kwargs):

        chimerge_threshold = kwargs.get("chimerge_threshold", self.chimerge_threshold)
        min_bin = kwargs.get("min_bin", self.min_bin)
        max_bin = kwargs.get("max_bin", self.max_bin)
        ## merge all bins pairs with chi2 < chimerge_threshold, starting with lowest chi1 value
        ## stop when min_bin is reached, or when no more chi2 < critical
        while len(df_chi2) > min_bin:
            sr_chi2 = df_chi2['chi2_after_merge_with_left'][1:]  ## index 0's value is NA, we use index 1 onwards
            idx_min_chi2 = sr_chi2.astype(float).idxmin()
            if df_chi2.loc[idx_min_chi2, 'chi2_after_merge_with_left'] > chimerge_threshold:
                break  ## stop this loop if no more chi2 < threshold
            idx_right = idx_min_chi2
            idx_left = idx_min_chi2 - 1
            df_chi2 = self.merge_pair(df_chi2, idx_left, idx_right)
        
        ## further merge bins if max_bin < current bin count
        if max_bin is not None:  
            while max_bin < len(df_chi2):
                sr_chi2 = df_chi2['chi2_after_merge_with_left'][1:]
                idx_min_chi2 = sr_chi2.astype(float).idxmin()
                idx_right = idx_min_chi2
                idx_left = idx_min_chi2 - 1
                df_chi2 = self.merge_pair(df_chi2, idx_left, idx_right)

        df_bin_interval = df_chi2.drop(
            columns=["good_count", "bad_count_exp", "good_count_exp", "chi2", "chi2_after_merge_with_left"]).copy() ## chi2 intermediate workings are dropped
        df_bin_interval.columns = df_bin_interval.columns.tolist()[:-3] + ["total", 'bad', 'bad_rate'] ## handles both cat and continuous
        df_bin_interval['total_rate'] = df_bin_interval.total / df_bin_interval.total.sum()
        cols = df_bin_interval.columns.tolist()[:-4] + ['total', 'total_rate', 'bad', 'bad_rate'] ## re-order the columns
        df_bin_interval = df_bin_interval[cols].reset_index(drop=True)

        return df_bin_interval, df_chi2

    def find_cut_point(self, df_bin_interval, bin_num_temp, **kwargs ):
        ## df_bin_temp is df_bin_interval after adding columns in self.top_down_cut()
        method = kwargs.get("method","iv")
        df_bin_temp = df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp]
        if len(df_bin_temp) == 1:
            return -1, -1
       
        best_cut_right = -1
        score_best = -1
    
        if method=='iv':
            ## try all cut points within the rows in df
            # if 5 bins 0,1,2,3,4 will try cut at 1,2,3,4 bin < cut_point is bin_left. 
            # eg cut_point is 2, left is 0,1, right is 2,3,4
            
            iv_best = -1
            eps = np.finfo(np.float32).eps
            
            for cut_point in range( df_bin_temp.index.min()+1 , df_bin_temp.index.max() ):     
                bin_left = df_bin_temp.loc[:cut_point-1, :]
                bin_right = df_bin_temp.loc[cut_point: ,:]
                # represent the parts in WOE in variables
                good_over_good_total_left = (bin_left.total.sum() - bin_left.bad.sum()) / (df_bin_temp.total.sum() - df_bin_temp.bad.sum())
                good_over_good_total_right = (bin_right.total.sum() - bin_right.bad.sum()) / (df_bin_temp.total.sum() - df_bin_temp.bad.sum())
                bad_over_bad_total_left =  bin_left.bad.sum() / df_bin_temp.bad.sum() 
                bad_over_bad_total_right = bin_right.bad.sum() / df_bin_temp.bad.sum()
                
                ## to give a very high value when good_over_good_total = 0
                woe_left =  np.log( (bad_over_bad_total_left + eps) / (good_over_good_total_right + eps) )  
                woe_right = np.log( ( bad_over_bad_total_right + eps) / (good_over_good_total_right + eps) )

                ## left side iv
                iv = ( bad_over_bad_total_left - good_over_good_total_left ) * woe_left

                ## right side iv
                iv = iv + (bad_over_bad_total_right - good_over_good_total_right) * woe_right
                if iv > iv_best:
                    iv_best = iv
                    best_cut_right = cut_point

            score_best = iv_best

        if method == "chi":
           
            chi2_best = -1
            eps = np.finfo(np.float32).eps
            overall_bad_rate = df_bin_temp.bad.sum() / df_bin_temp.total.sum() 
            overall_good_rate = 1 - overall_bad_rate

            for cut_point in range( df_bin_temp.index.min()+1 , df_bin_temp.index.max() ):     
                bin_left = df_bin_temp.loc[:cut_point-1, :]
                bin_right = df_bin_temp.loc[cut_point: ,:]
                # represent the parts in WOE in variables

                expected_bad_left = bin_left.total.sum() * overall_bad_rate
                expected_good_left = bin_left.total.sum() * overall_good_rate
                good_left = bin_left.total.sum() - bin_left.bad.sum()
                # chi2_left =  ( (bin_left.bad.sum() - expected_bad_left)**2 / expected_bad_left ) + ( (good_left - expected_good_left)**2 / expected_good_left )
                chi2_left = chisquare([bin_left.bad.sum(), good_left], f_exp=[expected_bad_left, expected_good_left])[0]
                
                ## + eps
                expected_bad_right = bin_right.total.sum() * overall_bad_rate
                expected_good_right = bin_right.total.sum() * overall_good_rate
                good_right = bin_right.total.sum() - bin_right.bad.sum()
                # chi2_right =  ( (bin_right.bad.sum() - expected_bad_right)**2 / expected_bad_right ) + ( (good_right - expected_good_right)**2 / expected_good_right )
                chi2_right = chisquare([bin_right.bad.sum(), good_right], f_exp=[expected_bad_right, expected_good_right])[0]
                chi2_total = chi2_left + chi2_right
                # print("line 477 debug chi2_cut, chi2 is ",chi2_total, ", cut point is ",cut_point )
                
                if chi2_total > chi2_best:
                    chi2_best = chi2_total
                    best_cut_right = cut_point
            
            ## update best score
            score_best = chi2_best

        if method == "entropy":
            ent_best = -1
            overall_bad_rate = df_bin_temp.bad.sum() / df_bin_temp.total.sum() 
            overall_good_rate = 1 - overall_bad_rate
            # entropy_total true for all cuts
            total_sample = df_bin_temp.total.sum() 
            entropy_total = 0 - overall_bad_rate * (np.log(overall_bad_rate)) - overall_good_rate * (np.log(overall_good_rate))

            for cut_point in range( df_bin_temp.index.min()+1 , df_bin_temp.index.max() ): 
                bin_left = df_bin_temp.loc[:cut_point-1, :]
                bin_right = df_bin_temp.loc[cut_point: ,:]

                bad_rate_left = bin_left.bad.sum() / bin_left.total.sum()
                good_rate_left = 1 - bad_rate_left
                total_rate_left = bin_left.total.sum() / total_sample

                bad_rate_right = bin_right.bad.sum() / bin_right.total.sum()
                good_rate_right = 1 - bad_rate_right
                total_rate_right = bin_right.total.sum() / total_sample
                
                entropy_conditinal = 0

                entropy_temp_left = 0
                entropy_temp_left -= good_rate_left * np.log( good_rate_left )
                entropy_temp_left -= bad_rate_left * np.log( bad_rate_left )
                entropy_conditinal = entropy_conditinal + total_rate_left * entropy_temp_left

                entropy_temp_right = 0
                entropy_temp_right -= good_rate_right * np.log( good_rate_right )
                entropy_temp_right -= bad_rate_right * np.log( bad_rate_right )
                entropy_conditinal = entropy_conditinal + total_rate_right * entropy_temp_right

                entropy_cut = 1 - (entropy_conditinal / entropy_total)  
                # print("line 519 debug entropy ---- entropy_cut is ", entropy_cut, ", cut point is ",cut_point, " ent cond and ent total is: ", entropy_conditinal, entropy_total )

                if entropy_cut > ent_best:
                    ent_best = entropy_cut
                    best_cut_right = cut_point
            
            ## update best score
            score_best = ent_best

        return best_cut_right, score_best

    def cut_and_evaluate(self, df_bin_interval, bin_num_temp, **kwargs):
         ## df_bin_temp is df_bin_interval after adding columns in self.top_down_cut()
        method = kwargs.get("method", "iv")
        force_cut = kwargs.get("force_cut", False)
        best_cut_right, score = self.find_cut_point(df_bin_interval, bin_num_temp, **kwargs)
        df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp, "max_score_if_cut"] = score
        
        # decide wether to cut based on score and method
        old_score = df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp, "score"].iloc[0]
        decide_cut_iv_entropy = ( (method=="iv" or method =="entropy") and score > old_score )
        decide_cut_chi2 = (method == "chi" and score > self.chimerge_threshold)  ## and score > old_score ??
        
        # score better than before, will cut into 2 parts
        if decide_cut_iv_entropy or decide_cut_chi2 or force_cut:
            df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp, "score"] = score

            idx_min = df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp].index.min()
            idx_max = df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp].index.max()

            df_bin_interval.loc[idx_min:best_cut_right-1,"bin_temp"] = bin_num_temp*2 + 1
            df_bin_interval.loc[best_cut_right:idx_max,"bin_temp"] = bin_num_temp*2 + 2
            # print("cutting bin ", bin_num_temp," cut at ", best_cut_right, "score ", score )               

        # score no improvement, stop cutting this branch
        else:
            df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp ,"keep_cutting"] = 0  
            # print("stop cutting bin ", bin_num_temp," sum of keep_cutting is ", df_bin_interval.keep_cutting.sum() )
            # score less than before, stop cutting for this temp bin
            
        return df_bin_interval     

    
    def cut_top_down(self, df_bin_interval, **kwargs):
        
        max_bin = kwargs.get("max_bin", self.max_bin)
        min_bin = kwargs.get("min_bin", self.min_bin)
        # pretend that all bins are in the same initial temp bin 0
        df_bin_interval["bin_temp"] = 0
        df_bin_interval["score"] = 0
        df_bin_interval["max_score_if_cut"] = 0
        df_bin_interval["keep_cutting"] = 1
        
        keep_cutting = (df_bin_interval["keep_cutting"].sum()>0)
        
        # start looking at each temp bin and cut
        while keep_cutting:
            
            # find unique temp bins
            lst_current_bins = df_bin_interval.bin_temp.unique().tolist()
            for bin_num_temp in lst_current_bins:
                
                ## only try cutting if this temp bin is labelled keep_cutting == 1
                if df_bin_interval.loc[df_bin_interval.bin_temp == bin_num_temp, "keep_cutting"].sum()>0:
                    # print("cutting bin ", bin_num_temp)
                    df_bin_interval = self.cut_and_evaluate(df_bin_interval, bin_num_temp, **kwargs)

            has_bin_to_cut = (df_bin_interval["keep_cutting"].sum() > 0)
            below_max_bin = (df_bin_interval['bin_temp'].nunique() < max_bin)
            keep_cutting = ( has_bin_to_cut and below_max_bin )

        while (df_bin_interval['bin_temp'].nunique() < min_bin):
            # if min bin is not satisfied, keep cutting the highest score possible bin
            idx = df_bin_interval.max_score_if_cut.idxmax()
            bin_num_temp = df_bin_interval.bin_temp[idx]
            # print("to satisfy min bin, force cutting temp bin: ",bin_num_temp)
            df_bin_interval = self.cut_and_evaluate(df_bin_interval, bin_num_temp, force_cut = True, **kwargs)

        
        # merge the temp bins, using pandas aggregate methods
        df_group = pd.DataFrame(columns=["bin", "total", "total_rate", "bad", "bad_rate"])
        df_aggregate = df_bin_interval.groupby(by=["bin_temp"])["total","bad"].sum().loc[:,["total", "bad"] ].reset_index(drop=True)
        df_group.total = df_aggregate.total
        df_group.bad = df_aggregate.bad
        total_sample = df_group.total.sum()
        df_group.total_rate = df_group.total / total_sample
        df_group.bad_rate = df_group.bad / df_group.total

        # find the right intervals for each temp bin
        ar_left = df_bin_interval.groupby(by=["bin_temp"])["bin_low"].min().tolist()
        ar_right = df_bin_interval.groupby(by=["bin_temp"])["bin_up"].max().tolist()
        for idx, row in df_group.iterrows():
            df_group.bin[idx] = pd.Interval(left = ar_left[idx], right = ar_right[idx], closed = "right")

        # sort by bin interval, min to max
        df_group = df_group.sort_values(by=['bin']).reset_index(drop=True)

        return df_group  ## debug only, actual is df_group

    def set_significant_figures(self, sr_feature, unique_range):  # eg (1000,5000)

        if (len(sr_feature.unique()) < unique_range[1]):
            return sr_feature

        decimal_place = 6  # start trying with round(sr_feature, 6) 
        sr_result = sr_feature.copy()

        while (len(sr_result.unique()) > unique_range[1]):
            decimal_place -= 1
            sr_result = round(sr_feature, decimal_place)

        if (len(sr_result.unique()) < unique_range[0]):
            decimal_place += 1
            sr_result = round(sr_feature, decimal_place)

        print(sr_feature.name, " rounded to decimal point: ", decimal_place, "   unique values counts = ",len(sr_result.unique()))
        
        return sr_result
    
    def find_turn_count(self, sr):
        ## function to find the longest monotonically decreasing / increasing bad rates in a list
        ## called by self.force_monotone()
        turn_count = 0
        if len(sr) <= 2:
            return 0

        for idx in range(1, len(sr)-1):
            # if it is a turning point
            if (sr[idx-1] > sr[idx] and sr[idx] < sr[idx+1]) or (sr[idx-1] < sr[idx] and sr[idx] > sr[idx+1]):
                turn_count += 1

        return turn_count

    def merge_cont_bin(self, df_bin_interval, idx_left, idx_right):
        ## simply merging 2 bins without calculating chi2. Useful to handle df_bin_interval after chi2 or other method
        ## called by force_monotone()

        df_copy = df_bin_interval.copy()

        df_copy.loc[idx_left,"bad"] += df_copy.loc[idx_right,"bad"]
        df_copy.loc[idx_left, "total"] += df_copy.loc[idx_right,"total"]
        if 'bin_up' in df_copy.columns.to_list():
            df_copy.loc[idx_left, "bin_up"] = df_copy.loc[idx_right, "bin_up"] 
        df_copy.loc[idx_left, "bad_rate"] = df_copy.loc[idx_left,"bad"] / df_copy.loc[idx_left, "total"]
        df_copy.loc[idx_left, "bin"] = pd.Interval(left = df_copy.loc[idx_left, "bin"].left , right = df_copy.loc[idx_right, "bin"].right, closed = 'right')  
        df_copy = df_copy.drop(idx_right).reset_index(drop=True)
        
        return df_copy

    def choose_turning_point_and_neighbor(self, sr_bad_rate):
        ## called by force_monotone()
        ## sr_bad_rate can also be a pd series
        idx_res_left = 0
        idx_res_right = 1
        min_diff = 1 ## bad_rate is 0~1
        idx_turn = 1

        # find the indexes of the pair with closest bad_rate (from turning points)
        for idx in range (1, len(sr_bad_rate)-1):
            is_up_turn = (sr_bad_rate[idx-1] > sr_bad_rate[idx] and sr_bad_rate[idx] < sr_bad_rate[idx+1])
            is_down_turn = (sr_bad_rate[idx-1] < sr_bad_rate[idx] and sr_bad_rate[idx] > sr_bad_rate[idx+1])
            # if it is a turning point (first occurance)
            if is_up_turn or is_down_turn:
                diff_left = abs(sr_bad_rate[idx-1] - sr_bad_rate[idx])
                diff_right = abs(sr_bad_rate[idx] - sr_bad_rate[idx+1])
                # if difference with left is lower
                if diff_left <= diff_right and diff_left < min_diff:
                    idx_res_left, idx_res_right = idx-1, idx
                    min_diff = diff_left
                    idx_turn = idx
                # if difference with right is lower
                elif diff_left > diff_right and diff_right < min_diff:
                    idx_res_left, idx_res_right = idx, idx+1
                    min_diff = diff_right
                    idx_turn = idx

        # return is outside for loop, to find the minimum of bad_rate differences       
        return idx_res_left, idx_res_right, idx_turn
    
    def force_monotone(self, df_bin_interval, **kwargs):
        ## df_bin_interval has columns bin, total, total_rate, bad, bad_rate
        force_mono = kwargs.get('force_mono', 'u_shape')  ## possible values: 'u_shape', 'mono'
        max_bin = kwargs.get('max_bin', 10) ## need to

        if force_mono == 'u_shape':
            allowed_turns = 1  
        else:
            allowed_turns = 0
        
        ## start merging until allowed number of turning points are reached
        while (self.find_turn_count(df_bin_interval.bad_rate) > allowed_turns):
            
            # find the left and right index to merge
            idx_left, idx_right, idx_turn = self.choose_turning_point_and_neighbor(df_bin_interval.bad_rate)
            
            # print(self.find_turn_count(df_bin_interval.bad_rate)," turns, merging ",idx_left," ", idx_right)
            
            # merge the twn bins
            df_bin_interval = self.merge_cont_bin(df_bin_interval, idx_left, idx_right)

        # if u shape and has 1 last turn
        if force_mono == 'u_shape' and self.find_turn_count(df_bin_interval.bad_rate) == 1:
            idx_left, idx_right, idx_turn = self.choose_turning_point_and_neighbor(df_bin_interval.bad_rate)
            
            # if that turn is at beginning or end
            if idx_turn == 1 or idx_turn == len(df_bin_interval.bad_rate)-2:
            
                # we will merge that with neighbors, untill we see mono
                while (self.find_turn_count(df_bin_interval.bad_rate) > 0):
                    
                    # find the left and right index to merge
                    idx_left, idx_right, idx_turn = self.choose_turning_point_and_neighbor(df_bin_interval.bad_rate)
                    
                    # print(self.find_turn_count(df_bin_interval.bad_rate)," turns, merging ",idx_left," ", idx_right)
                    
                    # merge the twn bins
                    df_bin_interval = self.merge_cont_bin(df_bin_interval, idx_left, idx_right)
        
        return df_bin_interval

    def calc_woe(self, df_bin_interval):
        eps = np.finfo(float).eps
        total_count = df_bin_interval.total.sum()
        total_bad = df_bin_interval.bad.sum()
        total_good = total_count - total_bad

        df_bin_interval['good'] = df_bin_interval['total'] - df_bin_interval['bad']
        df_bin_interval['good_density'] = df_bin_interval['good'] / total_good
        df_bin_interval['bad_density'] = df_bin_interval['bad'] / total_bad
        df_bin_interval['woe'] = np.log((df_bin_interval['good_density'].astype('float64') + eps) / (df_bin_interval['bad_density'].astype('float64') + eps))
        df_bin_interval['iv'] = np.log((df_bin_interval['good_density'].astype('float64') + eps) / (df_bin_interval['bad_density'].astype('float64') + eps)) * (df_bin_interval['good_density'].astype('float64') - df_bin_interval['bad_density'].astype('float64'))
        
        return df_bin_interval.drop(columns=['good', 'good_density', 'bad_density'])

    def fit_single_cont(self, x, y, **kwargs):
        method = kwargs.get("method", "iv")
        force_mono = kwargs.get("force_mono", None)
        max_bin = kwargs.get("max_bin", self.max_bin)

        df_na_bin , df_bin_interval = self.init_cont(sr_feature=x, y=y, **kwargs)
        
        if (df_bin_interval.shape[0] < max_bin):
            ## if bin count after init < max_bin, skip the merging / cutting
            df_bin_interval = df_bin_interval.drop(columns = ['bin_low', 'bin_up'])
            print(x.name, "has limited unique values, count < max_bin, skipped merging / cutting")

        elif method == "chi_merge":
            # bottum up merging
            df_all_bin = pd.concat([df_na_bin, df_bin_interval], axis = 0).reset_index(drop = True)
            df_mapped = self.map_bin(x, df_all_bin, inplace = True) ## initial map to both NA and normal bins
            df_chi2 = self.calc_chi2(df_mapped, y, df_all_bin[len(df_na_bin):], **kwargs)
            df_bin_interval, df_chi2 = self.chi2_merge(df_chi2, **kwargs)
            df_bin_interval = df_bin_interval.drop(columns = ['bin_low', 'bin_up'])

        else: 
            # cutting by iv, chi2, or entropy
            df_bin_interval = self.cut_top_down(df_bin_interval, **kwargs) # high level method of top down cutting
        
        # post processing to find total rate
        total_sample = df_bin_interval.total.sum() + df_na_bin.total.sum()
        df_bin_interval.total_rate = df_bin_interval.total / total_sample

        # force monotone of bad rate
        if force_mono: 
            df_bin_interval = self.force_monotone(df_bin_interval, force_mono = force_mono)
        
        ## final merge with NA bins
        df_bin_interval = pd.concat([df_bin_interval, df_na_bin], axis = 0).reset_index(drop = True) ## final merge with NA bins            

        ## calculate woe and iv of each bin
        df_bin_interval = self.calc_woe(df_bin_interval)

        # drop bins where total count == 0
        dict_na = kwargs.get('missing_values', [])
        if type(dict_na) == list:
            df_bin_interval = df_bin_interval[df_bin_interval.total != 0].reset_index(drop = True)

        return df_bin_interval
    

    def fit_single_cat(self, x, y, **kwargs):
        ## expects x as a series object like df.column or df['column']
        method = kwargs.get("method", "chi_merge")
        max_bin = kwargs.get("max_bin", self.max_bin)
        merge_category = kwargs.get("merge_category", True)
        ## initialise the bins
        df_na_bin, df_bin_interval = self.init_cat_bin(x, y, **kwargs)

        if (df_bin_interval.shape[0] < max_bin):
            ## if bin count after init < max_bin, skip the merging / cutting
            print(x.name, "has limited unique values, count < max_bin, skipped merging")

        elif merge_category == False:
            print(x.name, "is categorical, not merging bins according to user's input merge_category")

        else:
            # if method == 'chi_merge':   ## Cat has only chi_merge
            df_chi2 = self.calc_chi2_cat(df_bin_interval)  
            df_bin_interval, df_chi2 = self.chi2_merge(df_chi2, **kwargs)

        if merge_category:
            # merge categorical bins that have the same bad rates
            while ( df_bin_interval.bad_rate.nunique() < len(df_bin_interval) ):
                ## to find 2 bins that are equal in bad rate, merge
                df_same_badrate = df_bin_interval.groupby("bad_rate").filter(lambda x: len(x) > 1)
                idx_left = df_same_badrate.index[0]
                idx_right = df_same_badrate.index[1] 
                df_bin_interval = self.merge_cat_bin(df_bin_interval, idx_left, idx_right)              
               
        df_bin_interval = pd.concat([df_na_bin, df_bin_interval], axis=0)

        #post processing
        total_sample = df_bin_interval.total.sum()
        df_bin_interval.total_rate = df_bin_interval.total / total_sample

        df_bin_interval = df_bin_interval.sort_values(by=['bad_rate']).reset_index(drop=True)
        
        df_bin_interval = self.calc_woe(df_bin_interval)

        # drop bins where total count == 0, when missing_values is list
        dict_na = kwargs.get('missing_values', [])
        if type(dict_na) == list:
            df_bin_interval = df_bin_interval[df_bin_interval.total != 0].reset_index(drop = True)

        return df_bin_interval


    def fit(self, df_feature, df_label, **kwargs):
        # note: df_label is y in sklearn, it is a series like df.dpd30

        lst_cat_feature = kwargs.get("categorical_features", []) ## default assume 0 categorical features
        label = kwargs.get("label", df_label.name)
        self.label = label
        unique_range = kwargs.get("unique_range", None)
        feature_list = kwargs.get("feature_list", df_feature.columns.tolist())
        lst_excluded_ft = kwargs.get("exclude", [])
        dict_na = kwargs.get("missing_values", {})

        self.numerical_features = list(set(df_feature.columns.tolist()) - set(lst_cat_feature) - set([label]))
        self.categorical_features = lst_cat_feature
        if len(lst_cat_feature) == 0:
            print("no categorical_features list is passed, assuming all features are numerical.")

        lst_bin = list()
        lst_ft = list()
        lst_iscat = list() ## A list of boolean values, storing if a feature is categorical
        
        ## fit features that are categorical
        for feature_name in lst_cat_feature:
            if feature_name not in df_feature.columns.to_list():
                print("------- ",feature_name," in param lst_cat_feature NOT found in Dataframe columns, skipped, please check ----------")
                continue
            if (len(df_feature[feature_name])!=len(df_label)):
                print("fit() skipped for this feature. Please make sure length of x and y are the same for x feature name: ", feature_name)
                continue
            if (feature_name == label) or (feature_name in lst_excluded_ft) or (feature_name not in feature_list):
                continue
            
            print("------- fitting: ",feature_name, " -------")
            ## assume all categorical value is str, also force to str in self.transform()
            sr_x = df_feature[feature_name].astype(str)
            df_bin_interval = self.fit_single_cat(sr_x, df_label, **kwargs) 
            lst_bin.append(df_bin_interval)
            lst_ft.append(feature_name)
            lst_iscat.append(True)

        ## fit features that are continuous
        for feature_name in self.numerical_features:
            if (feature_name == label) or (feature_name in lst_excluded_ft) or (feature_name not in feature_list):
                continue

            print("------- fitting: ",feature_name, " -------")
            sr_x = df_feature[feature_name]
            if unique_range is not None:
                sr_x = self.set_significant_figures(sr_x, unique_range)
            df_bin_interval = self.fit_single_cont(sr_x, df_label, **kwargs)
            lst_bin.append(df_bin_interval)
            lst_ft.append(feature_name)
            lst_iscat.append(False)

        ## prepare return model
        ## return model: df_bin_model has three columns ['feature_name', 'is_cat', 'bin_info']
        ## feature_name is a list of feature names
        ## is_cat indicates whether the feature is categorical(True) or numerical(False)        
        ## bin_info is df with columns [ bin, total, total_rate, bad, bad_rate ]
        df_bin_model = pd.DataFrame(columns=['feature_name', 'is_cat', 'bin_info'])
        df_bin_model['feature_name'] = lst_ft
        df_bin_model['is_cat'] = lst_iscat
        df_bin_model['bin_info'] = lst_bin ## Each bin in bin_info: if categorical, is a list of string values of that bin OR if numerical, is a pd.interval
        
        self.model = df_bin_model
        self._fit = True

        # make a copy of fit() result
        self.model_backup = self.model.copy()

        lst_df = []
        for index, row in self.model.iterrows():
            df_tmp = row['bin_info']
            df_tmp['var'] = row['feature_name']
            df_tmp['is_cat'] = row['is_cat']
            lst_df.append(df_tmp)

        self.woe_encoder = pd.concat(lst_df)

        return self

    def transform(self, df_feature, **kwargs):
        inplace = kwargs.get("inplace", True)
        bin_only = kwargs.get("bin_only", True) ## need to change lower level self.map_bin()
        lst_feature_names = kwargs.get("feature_list", [])  ## default transform all, unless specify the columns
        lst_exclude = kwargs.get("exclude", [])
        dict_na = kwargs.get("missing_values", None)

        if self._fit is False:
            raise ValueError("No model exists, please call self.fit(df_feature, df_label) to fit the model first")

        if dict_na is None:
            print("----- No missing_values list is passed in. -----")

        lst_trans = list()
        for idx, row in self.model.iterrows():
            
            name = row['feature_name']
            if name in lst_exclude:
                continue

            # will transform this feature if user never input feature_list, or the name is in the feature_list
            if (len(lst_feature_names)==0) or (name in lst_feature_names):
                print("----- transforming: ", name," -----")
                
                if dict_na:
                    lst_missing_found = self.missing_values_found.get(name,[])
                    lst_missing_user = dict_na.get(name,[])
                    if set(lst_missing_user) != set(lst_missing_found):
                        print(name," missing values found do not match what is passed.")
                        print("Found but not passed:", set(lst_missing_found)-set(lst_missing_user)," Passed but not found: ", set(lst_missing_user)-set(lst_missing_found))

                if row['is_cat'] == True:
                    # transform single categorical feature
                    df_trans = self.map_bin(df_feature[name].astype(str), row.bin_info, inplace = True, bin_only = bin_only, cat = True)
                    lst_trans.append(df_trans)
                else:
                    df_trans = self.map_bin(df_feature[name], row.bin_info, inplace = True, bin_only = bin_only, cat = False)
                    lst_trans.append(df_trans)

        df = pd.concat(lst_trans, axis = 1)
        df_copy = df_feature.copy()
        df_copy.update(df)
        
        if inplace:
            df_feature.update(df)

        return df_copy

    def evaluate_model_bin_count(self):
        if self._fit is False:
            print("No model yet, please call self.fit() first")
            return
        
        feature_count = self.model.shape[0]

        lst_bin_count = []
        for idx in range(feature_count):
            bin_count = self.model.bin_info[idx].shape[0]
            lst_bin_count.append(bin_count)

        sr_bin_count = pd.Series(lst_bin_count)

        bin_min = sr_bin_count.min()
        bin_max = sr_bin_count.max()
        bin_mean = sr_bin_count.mean()
        print("min, max, mean of bin count is : ", bin_min, " ", bin_max," ", bin_mean)

        self.model["bin_count"] = sr_bin_count

        return self.model

    def set_rules(self, dict_rules, data):  ## user has to pass in data (df), in order to re-calculate bad, total, woe and iv

        if self._fit is False:
            print("No model yet, please call self.fit() first")
            return

        if dict_rules == "recover":
            self.model = self.model_backup.copy()
            dict_rules = {}

        for key in dict_rules:
            
            if key in self.model.feature_name.to_list():
                
                row_feature = self.model.loc[self.model.feature_name == key]
                feature_is_cat = row_feature.is_cat.iloc[0]
                df_bin_interval_user = pd.DataFrame(columns= ['bin', 'total', 'total_rate', 'bad', 'bad_rate'])
                lst_user_bin = dict_rules.get(key) ## it will be a list of lists for cat / list of integers for continuous
                df_bin_info = row_feature.bin_info.iloc[0]

                if  feature_is_cat:

                    lst_cat_values = list()
                    # find all the categorical values of this feature
                    for index, row_bin in df_bin_info.iterrows():
                        lst_cat_values += row_bin.bin
                    
                    lst_user_values = list()
                    # find the set of value that user passed in
                    for lst_one_bin in lst_user_bin:
                        lst_user_values += lst_one_bin

                    lst_values_not_in_dict = list(set(lst_cat_values)-set(lst_user_values))

                    if len(lst_values_not_in_dict) > 0:
                        # append the values that user did not pass in as the last bin
                        lst_user_bin.append(lst_values_not_in_dict)

                    df_bin_interval_user.bin = lst_user_bin
                
                else:
                    # for continuous expect a list like [0,2,4,6,8,12]
                    # default will not expect user to change NA bins
                    lst_na_bins = df_bin_info.loc[ df_bin_info['bin'].map(type) == list, 'bin'].to_list()
                    
                    sr_user_bin = pd.Series(lst_user_bin)
                    
                    # if user passes in something like [[-999900],[-999901,-999902],0,2,4,6,9]
                    if (sr_user_bin.map(type) == list).sum()>0:
                        # we will overwrite the na bins as he wishes
                        lst_na_bins = sr_user_bin[sr_user_bin.map(type) == list].to_list()
                        lst_user_bin = sr_user_bin[sr_user_bin.map(type) != list].to_list()

                    if -990000 not in lst_user_bin:
                        lst_user_bin = [-990000]+lst_user_bin
                    if np.inf not in lst_user_bin:
                        lst_user_bin.append(np.inf)

                    lst_user_bin.sort()

                    lst_bin_low = lst_user_bin[:-1]
                    lst_bin_up = lst_user_bin[1:]

                    lst_bin_interval = list()

                    for i in range(0, len(lst_bin_low)):
                        bin_interval = pd.Interval(left = lst_bin_low[i], right = lst_bin_up[i], closed = 'right')
                        lst_bin_interval.append(bin_interval)        

                    # merge the NA bins
                    lst_bin_interval = lst_bin_interval + lst_na_bins

                    df_bin_interval_user.bin = lst_bin_interval

                ## re-calculate woe , iv .... if there is df passed in, and the feature is found in data
                if (data is not None) and (key in data.columns.to_list()):
                    sr_feature = data[key]
                    y = data[self.label]
                    df = pd.concat([sr_feature, y], axis=1)

                    for idx, row in df_bin_interval_user.iterrows():

                        if type(row.bin) == list:
                            df_bin_interval_user.loc[idx,'total'] = df[sr_feature.name].isin(df_bin_interval_user.loc[idx, 'bin']).sum()
                            df_bin_interval_user.loc[idx, 'bad'] = len(df.loc[(df[sr_feature.name].isin(row.bin)) & (df[y.name] == 1)]) 
                        else:
                            df_bin_interval_user.loc[idx,'total'] = len(sr_feature[(sr_feature > row.bin.left) & (sr_feature <= row.bin.right)])
                            df_bin_interval_user.loc[idx, 'bad'] = len(y[((sr_feature > row.bin.left) & (sr_feature <= row.bin.right)) & y==1])

                        df_bin_interval_user.loc[idx,'total_rate'] = df_bin_interval_user.loc[idx,'total'] / len(sr_feature)
                        if df_bin_interval_user.loc[idx,'total'] != 0:
                            df_bin_interval_user.loc[idx, 'bad_rate'] = df_bin_interval_user.loc[idx, 'bad'] / df_bin_interval_user.loc[idx,'total']

                    df_bin_interval_user = self.calc_woe(df_bin_interval_user)

                # update the bin_info in model
                self.model.bin_info[row_feature.index[0]] = df_bin_interval_user.copy()
                print("updated bins for ",key," , the df is now like:")
                print(self.model.bin_info[row_feature.index])
                print("user bin df is:")
                print(df_bin_interval_user)
            
            # if key not in model.feature_name
            else:
                print("column name ", key, " not in model.feature_name")

        lst_df = []
        for index, row in self.model.iterrows():
            df_tmp = row['bin_info']
            df_tmp['var'] = row['feature_name']
            df_tmp['is_cat'] = row['is_cat']
            lst_df.append(df_tmp)

        self.woe_encoder = pd.concat(lst_df)

        return self

    def drop_empty_missing_bin(self):
        if self._fit is False:
            print("No model yet, please call self.fit() first")
            return
        
        lst_df = []
        
        for idx, row in self.model.iterrows():
            row.bin_info = row.bin_info.loc[row.bin_info.total > 0].reset_index(drop = True)
            df_tmp = row['bin_info']
            df_tmp['var'] = row['feature_name']
            df_tmp['is_cat'] = row['is_cat']
            lst_df.append(df_tmp)

        self.woe_encoder = pd.concat(lst_df)

# High level Demo

## init object

In [286]:
helper = VarBinHelper()
helper.min_bin, helper.max_bin, helper.min_sample, helper.chimerge_threshold, helper.label
## here are some default values for the object

(2, 10, 0.02, 3.841458820694124, None)

In [287]:
## user can manually change them
helper.min_bin = 4
helper.max_bin = 9
helper.min_sample = 0.02
helper.set_chimerge_threshold( p=0.90, df = 1)
 
# 2 params, default p = 0.95 (p value in chi2 test), df = 1 (degree of freedom, we are doing binary tree cuts, so 2 choices, df =1)
# logically, user should only change the p-value. The lower the p-value, the lower chi2
# the lower chimerge_threshold, the less likely 2 bins should merge in "chi_merge", easier to cut in "chi"

helper.label = "dpd30"

helper.min_bin, helper.max_bin, helper.min_sample, helper.chimerge_threshold, helper.label

(4, 9, 0.02, 2.705543454095404, 'dpd30')

## fit( )

In [288]:
## possible params in fit()

#   param name               default         other possible values                 explanation

    # method                 "iv"          "chi", "chi_merge", "entropy"          the method of top-down cutting, or bottom up merging
    # init_method            "quantile"          "step"                           initialisation method
    # min_sample             0.01           can be int >1 or float 0<x<1          minimum sample ration of a bin, when <1, minimum sample count in a bin, when >1
    # min_bin                2               int > 1                              min number of bins
    # max_bin                10              int > 1                              max number of bins
    # missing_values         {}               dict or list                        user's NA list / dict, dict can be for each feature
    # force_mono             None        "u_shape" , "mono" (or any other str)    None is no forcing monotone, "u_shape" is allowing max 1 turn in bad rates, any other str (eg. "mono") is for strictly monotonous
    # unique_range           None           tuple of (int, int)                   None means no change in 精度. the range of unique values allowed for a numerical feature, eg 0.9876543212345 may be turned into 0.9877
    # merge_category         True                   False                         whether a categorical feature will have each value as 1 bin (False will lead to merging of small bins, and chi_merge)
    # multi_missing          False          True or None                          True will make each unique missing value in 1 bin. False will make all in 1 bin. None will allow na bins in merging / cutting, not recommended
    # init_merge_small_bin   True                False                            Default will merge small bins < bin_size, EVEN when the number of unique values in feature is small

    # same as other codes:
        # exclude,  
        # feature_list, 
        # categorical_features

In [289]:
## the case with minimum params
helper_default = VarBinHelper()
helper_default.fit(sample_df, sample_df.bad_ind, categorical_features = lst_cat)
helper_default.woe_encoder

------- fitting:  vehicle_year  -------
NA values  [None]  not found in  vehicle_year
vehicle_year has limited unique values, count < max_bin, skipped merging
------- fitting:  vehicle_make  -------
NA values  [None]  not found in  vehicle_make
------- fitting:  bankruptcy_ind  -------
NA values  [None]  not found in  bankruptcy_ind
bankruptcy_ind has limited unique values, count < max_bin, skipped merging
------- fitting:  used_ind  -------
NA values  [None, 'nan']  not found in  used_ind
used_ind has limited unique values, count < max_bin, skipped merging
------- fitting:  tot_tr  -------
------- fitting:  tot_rev_tr  -------
tot_rev_tr has limited unique values, count < max_bin, skipped merging / cutting
------- fitting:  loan_term  -------
loan_term has limited unique values, count < max_bin, skipped merging / cutting
------- fitting:  tot_income  -------
------- fitting:  down_pyt  -------
------- fitting:  tot_rev_line  -------
------- fitting:  ltv  -------
------- fitting:  loa

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv,var,is_cat
0,[nan],1,0.000171086,0,0,27.599461,0.005938,vehicle_year,True
1,"[2001.0, 1988.0, 1985.0, 1977.0, 1991.0, 1990....",2118,0.362361,367,0.173277,0.205962,0.014442,vehicle_year,True
2,[0.0],298,0.0509837,62,0.208054,-0.019921,0.000020,vehicle_year,True
3,[1997.0],713,0.121985,150,0.210379,-0.033974,0.000142,vehicle_year,True
4,[1999.0],1045,0.178785,220,0.210526,-0.034863,0.000220,vehicle_year,True
...,...,...,...,...,...,...,...,...,...
1,"(11740.0, inf]",815,0.139435,119,0.146012,0.409608,0.020614,tot_rev_debt,False
2,"[nan, -999999.0]",549,0.0939264,175,0.318761,-0.597149,0.039253,tot_rev_debt,False
0,"(-990000.0, 30357.0]",5220,0.893071,1095,0.20977,-0.030307,0.000828,purch_price,False
1,"(30357.0, 31989.0]",116,0.019846,15,0.12931,0.550452,0.005063,purch_price,False


In [290]:
helper_default.model.bin_info[18] ## tot_tr, the feature with some -99xxxx values

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv,var,is_cat
0,"(-990000.0, 210.0]",4296,0.734987,993,0.231145,-0.154763,0.018408,age_oldest_tr,False
1,"(210.0, inf]",1333,0.228058,132,0.0990248,0.851489,0.126118,age_oldest_tr,False
2,[nan],216,0.0369547,72,0.333333,-0.663471,0.019353,age_oldest_tr,False


In [292]:
## categorcal values merged into bins where total bin count < max_bin 
## (because merge_category was not in kwargs)
helper_default.model.bin_info[1]

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv,var,is_cat
0,"[FODGE, RICART, SEDAN, MERC BENZ, M-B, CHEVREL...",117,0.0200171,0,0.0,32.361635,0.814611,vehicle_make,True
1,"[LEXUS, AUDI, PORSCHE, VOLKSWAGEN, MITS, ACURA...",146,0.0249786,12,0.0821918,1.056315,0.019864,vehicle_make,True
2,"[MERCEDES, OLDSMOBILE, BMW, HONDA]",261,0.0446536,35,0.1341,0.508568,0.009858,vehicle_make,True
3,"[SATURN, MITSU, LINCOLN]",122,0.0208725,17,0.139344,0.464128,0.003893,vehicle_make,True
4,"[CHRYSLER, JEEP, BUICK, GMC, VW, TOYOTA, CHEVY...",3062,0.523867,607,0.198236,0.040735,0.000859,vehicle_make,True
5,[nan],345,0.0590248,70,0.202899,0.011657,8e-06,vehicle_make,True
6,"[FORD, None]",1129,0.193157,258,0.228521,-0.139936,0.003939,vehicle_make,True
7,"[OLDS, HYUNDAI, SUBARU]",190,0.0325064,45,0.236842,-0.186547,0.001193,vehicle_make,True
8,"[PONTIAC, ISUZU]",262,0.0448246,68,0.259542,-0.308268,0.004646,vehicle_make,True
9,"[MAZDA, SUZUKI, DAEWOO, MERC, PLYM, PLY, SAAB,...",141,0.0241232,42,0.297872,-0.499168,0.006883,vehicle_make,True


In [333]:
## define some NA values by user, both works
dict_na = {"vehicle_year":["1998.0"], "vehicle_make":["FORD","B50"]}
lst_na = [-999901,-999902,-999999,-990001]

## for example method = chi_merge, force_mono = "mono"
## used NA_dict for missing_values
helper_chi_merge_mono = VarBinHelper()
helper_chi_merge_mono.fit(sample_df, sample_df.bad_ind, categorical_features = lst_cat, method = "chi_merge", 
                          init_method = "step", min_sample = 0.02, min_bin = 3, max_bin = 5, 
                          missing_values = dict_na, force_mono = 'mono', unique_range = (1000,5000), merge_category = True)

------- fitting:  vehicle_year  -------
NA values  [None]  not found in  vehicle_year
------- fitting:  vehicle_make  -------
NA values  [None]  not found in  vehicle_make
------- fitting:  bankruptcy_ind  -------
NA values  [None]  not found in  bankruptcy_ind
bankruptcy_ind has limited unique values, count < max_bin, skipped merging
------- fitting:  used_ind  -------
NA values  [None, 'nan']  not found in  used_ind
used_ind has limited unique values, count < max_bin, skipped merging
------- fitting:  tot_tr  -------
-999999.0  found in feature: tot_tr , but not specified in missing_values.
-999902.0  found in feature: tot_tr , but not specified in missing_values.
-999901.0  found in feature: tot_tr , but not specified in missing_values.
------- fitting:  tot_rev_tr  -------
------- fitting:  loan_term  -------
nan , this missing value does not exist in  loan_term
------- fitting:  tot_income  -------
------- fitting:  down_pyt  -------
nan , this missing value does not exist in  dow

<__main__.VarBinHelper at 0x7f9dd498f6d8>

In [294]:
helper_chi_merge_mono.woe_encoder.loc[helper_chi_merge_mono.woe_encoder["var"] == "tot_tr"] 
## default all missing values in 1 bin

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv,var,is_cat
0,"(-990000, 7.858632]",1106,0.0482464,312,0.282098,-0.422538,0.037955,tot_tr,False
1,"(7.858632, 32.757818]",3801,0.650299,702,0.184688,0.128283,0.010298,tot_tr,False
2,"(32.757818, inf]",479,0.0819504,67,0.139875,0.459712,0.015017,tot_tr,False
3,"[nan, -999999.0, -999902.0, -999901.0]",459,0.0785287,116,0.252723,-0.272478,0.006298,tot_tr,False


In [295]:
## for example method = chi, force_mono = "u_shape"
## use NA_list for missing_values
helper_chi_u_shape = VarBinHelper()
helper_chi_u_shape.fit(sample_df, sample_df.bad_ind, categorical_features = lst_cat, method = "chi", 
                        init_method = "step", min_sample = 0.005, min_bin = 3, max_bin = 10, force_mono = 'u_shape', 
                        missing_values = lst_na, unique_range = (1000,5000), merge_category = False)

------- fitting:  vehicle_year  -------
NA values  [-999999, -999902, -999901, -990001, None]  not found in  vehicle_year
vehicle_year is categorical, not merging bins according to user's input merge_category
------- fitting:  vehicle_make  -------
NA values  [-999999, -999902, -999901, -990001, None]  not found in  vehicle_make
vehicle_make is categorical, not merging bins according to user's input merge_category
------- fitting:  bankruptcy_ind  -------
NA values  [-999999, -999902, -999901, -990001, None]  not found in  bankruptcy_ind
bankruptcy_ind has limited unique values, count < max_bin, skipped merging
------- fitting:  used_ind  -------
NA values  [-999999, -999902, -999901, None, 'nan', -990001]  not found in  used_ind
used_ind has limited unique values, count < max_bin, skipped merging
------- fitting:  tot_tr  -------
-990001 , this missing value does not exist in  tot_tr
------- fitting:  tot_rev_tr  -------
-999999 , this missing value does not exist in  tot_rev_tr
-9999

<__main__.VarBinHelper at 0x7f9dd431fac8>

In [316]:
# check the object attributes
helper_chi_u_shape.categorical_features

['vehicle_year', 'vehicle_make', 'bankruptcy_ind', 'used_ind']

In [317]:
helper_chi_u_shape.numerical_features

['tot_tr',
 'tot_rev_tr',
 'loan_term',
 'tot_income',
 'down_pyt',
 'tot_rev_line',
 'ltv',
 'loan_amt',
 'tot_open_tr',
 'tot_derog',
 'rev_util',
 'veh_mileage',
 'fico_score',
 'msrp',
 'age_oldest_tr',
 'tot_rev_debt',
 'purch_price']

In [318]:
helper_chi_u_shape.missing_values_found

{'age_oldest_tr': [nan],
 'bankruptcy_ind': ['nan'],
 'down_pyt': [],
 'fico_score': [nan],
 'loan_amt': [],
 'loan_term': [],
 'ltv': [nan],
 'msrp': [nan],
 'purch_price': [],
 'rev_util': [],
 'tot_derog': [nan],
 'tot_income': [nan],
 'tot_open_tr': [nan],
 'tot_rev_debt': [nan, -999999],
 'tot_rev_line': [nan],
 'tot_rev_tr': [nan],
 'tot_tr': [nan, -999999, -999902, -999901],
 'used_ind': [],
 'veh_mileage': [nan],
 'vehicle_make': ['nan'],
 'vehicle_year': ['nan']}

## transform()

In [319]:
sample_df

Unnamed: 0,bad_ind,vehicle_year,vehicle_make,bankruptcy_ind,tot_derog,tot_tr,age_oldest_tr,tot_open_tr,tot_rev_tr,tot_rev_debt,tot_rev_line,rev_util,fico_score,purch_price,msrp,down_pyt,loan_term,loan_amt,ltv,tot_income,veh_mileage,used_ind
0,1,1998.0,FORD,N,7.0,-999901.0,64.0,2.0,1.0,-999999.0,500.0,101,650.0,17200.00,17350.0,0.00,36,17200.00,99.0,6550.00,24000.0,0
1,0,2000.0,DAEWOO,N,0.0,-999901.0,240.0,11.0,7.0,-999999.0,57241.0,60,649.0,19588.54,19788.0,683.54,60,19588.54,99.0,4666.67,22.0,0
2,1,1998.0,PLYMOUTH,N,7.0,-999901.0,60.0,,,-999999.0,,0,613.0,13595.00,11450.0,0.00,60,10500.00,92.0,2000.00,19600.0,0
3,1,1997.0,FORD,N,3.0,-999901.0,35.0,5.0,4.0,-999999.0,5946.0,68,603.0,12999.00,12100.0,3099.00,60,10800.00,118.0,1500.00,10000.0,0
4,0,2000.0,TOYOTA,N,0.0,-999901.0,104.0,2.0,0.0,-999999.0,1800.0,0,764.0,26328.04,22024.0,0.00,60,26328.04,122.0,4144.00,14.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5840,0,1997.0,PORSCHE,N,0.0,21.0,417.0,4.0,2.0,1859.0,52200.0,4,801.0,0.00,31000.0,0.00,36,31000.00,100.0,5000.00,45000.0,8
5841,0,2000.0,TOYOTA,Y,2.0,8.0,62.0,5.0,3.0,4992.0,5066.0,99,628.0,24970.00,22024.0,0.00,60,24970.00,117.0,2400.00,21.0,8
5842,0,1997.0,CHEVROLET,N,0.0,6.0,30.0,4.0,3.0,972.0,5616.0,17,735.0,20949.00,18950.0,0.00,36,20949.00,113.0,1837.50,25000.0,8
5843,0,1999.0,MERCURY,N,0.0,9.0,67.0,7.0,5.0,13714.0,14061.0,98,737.0,22400.00,28700.0,5300.00,48,17100.00,60.0,28000.00,0.0,8


In [320]:
## kwargs:
    # inplace, default: True, will overwrite the df in parameters
                        # if False, the df in params will not be chaneged, only the returned df is updated
                        
    # bin_only, default: True , will overwrite with intervals like(1.5,5] 
                        # if False, will overwrite with woe value of that bin
                        # if None, will overwrite with bin number of that bin

    # feature_list: list of feature names that the user wants to transform, default [], all transform
    
    # exclude: list of feature names that users does not want to transform, default [], no exclude

    # missing_values: dictionary of "featurename" : [ list of missing values ]. default: {}
                        
helper_chi_u_shape.transform(sample_df, inplace = False, bin_only = False) 
                                                            ## will fill with WOE values

----- No missing_values list is passed in. -----
----- transforming:  vehicle_year  -----
----- transforming:  vehicle_make  -----
----- transforming:  bankruptcy_ind  -----
----- transforming:  used_ind  -----
----- transforming:  tot_tr  -----
----- transforming:  tot_rev_tr  -----
----- transforming:  loan_term  -----
----- transforming:  tot_income  -----
----- transforming:  down_pyt  -----
----- transforming:  tot_rev_line  -----
----- transforming:  ltv  -----
----- transforming:  loan_amt  -----
----- transforming:  tot_open_tr  -----
----- transforming:  tot_derog  -----
----- transforming:  rev_util  -----
----- transforming:  veh_mileage  -----
----- transforming:  fico_score  -----
----- transforming:  msrp  -----
----- transforming:  age_oldest_tr  -----
----- transforming:  tot_rev_debt  -----
----- transforming:  purch_price  -----


Unnamed: 0,bad_ind,vehicle_year,vehicle_make,bankruptcy_ind,tot_derog,tot_tr,age_oldest_tr,tot_open_tr,tot_rev_tr,tot_rev_debt,tot_rev_line,rev_util,fico_score,purch_price,msrp,down_pyt,loan_term,loan_amt,ltv,tot_income,veh_mileage,used_ind
0,1,-0.067054,-0.13948,0.0527603,-0.669058,-0.272478,-0.197177,-0.005262,0.015964,-0.597149,-0.724096,-0.814650,-0.673071,0.017627,0.149879,-0.056694,0.608672,-0.012731,-0.132027,0.473909,0.119358,-0.001286
1,0,0.192400,-0.453751,0.0527603,0.554935,-0.272478,0.805308,0.124613,0.192355,-0.597149,1.592594,0.210641,-0.673071,0.158425,0.149879,-0.056694,-0.055962,-0.012731,-0.132027,-0.095375,0.119358,-0.001286
2,1,-0.067054,0.0624656,0.0527603,-0.669058,-0.272478,-0.446059,-0.255185,-0.686892,-0.597149,-0.679121,0.210641,-0.928425,-0.202899,-0.207644,-0.056694,-0.055962,-0.012731,0.331747,-0.095375,0.119358,-0.001286
3,1,-0.033974,-0.13948,0.0527603,-0.176426,-0.272478,-0.710921,0.124613,0.192355,-0.597149,-0.125888,0.210641,-1.285583,-0.202899,-0.207644,-0.056694,-0.055962,-0.012731,-0.498524,-0.095375,0.119358,-0.001286
4,0,0.192400,0.0857652,0.0527603,0.554935,-0.272478,-0.197177,-0.005262,0.015964,-0.597149,-0.417548,0.210641,2.043296,0.158425,0.149879,-0.056694,-0.055962,-0.012731,-0.498524,-0.095375,0.119358,-0.001286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5840,0,-0.033974,1.43659,0.0527603,0.554935,0.398740,0.805308,0.124613,0.015964,0.012154,1.592594,0.210641,2.043296,-0.202899,0.149879,-0.056694,0.608672,-0.012731,-0.132027,-0.095375,-0.320612,-0.228153
5841,0,0.192400,0.0857652,-0.147803,-0.176426,-0.429971,-0.446059,0.124613,0.192355,0.012154,-0.125888,-0.814650,-0.928425,0.158425,0.149879,-0.056694,-0.055962,-0.012731,-0.498524,-0.095375,0.119358,-0.228153
5842,0,-0.033974,-0.0766599,0.0527603,0.554935,-0.429971,-0.710921,0.124613,0.192355,0.012154,-0.125888,0.210641,1.229409,0.158425,0.149879,-0.056694,0.608672,-0.012731,-0.498524,-0.095375,0.119358,-0.228153
5843,0,-0.034863,-0.0348628,0.0527603,0.554935,-0.190254,-0.197177,0.124613,0.192355,0.386524,0.399224,-0.814650,1.229409,0.158425,0.149879,0.795684,-0.055962,-0.012731,0.887126,0.473909,0.119358,-0.228153


In [321]:
## only transform some columns
## inplace == True, the df in argument is also transformed
## bin_only == True, fill with bin interval / list

## define some NA values by user, both works
dict_na = {"vehicle_year":["1998.0"], "vehicle_make":["FORD","B50"]}
lst_na = [-999901,-999902,-999999,-990001]

sample_df_copy = sample_df.copy()
helper_chi_u_shape.transform(sample_df_copy, inplace = True, bin_only = True, 
                              feature_list = ["vehicle_year", "vehicle_make", "bankruptcy_ind", "purch_price","tot_derog","tot_tr"],
                             missing_values = dict_na)

## if pass missing_values, the model will check the difference btw user's and the model's missing values

----- transforming:  vehicle_year  -----
vehicle_year  missing values found do not match what is passed.
Found but not passed: {'nan'}  Passed but not found:  {'1998.0'}
----- transforming:  vehicle_make  -----
vehicle_make  missing values found do not match what is passed.
Found but not passed: {'nan'}  Passed but not found:  {'FORD', 'B50'}
----- transforming:  bankruptcy_ind  -----
bankruptcy_ind  missing values found do not match what is passed.
Found but not passed: {'nan'}  Passed but not found:  set()
----- transforming:  tot_tr  -----
tot_tr  missing values found do not match what is passed.
Found but not passed: {nan, -999999, -999902, -999901}  Passed but not found:  set()
----- transforming:  tot_derog  -----
tot_derog  missing values found do not match what is passed.
Found but not passed: {nan}  Passed but not found:  set()
----- transforming:  purch_price  -----


Unnamed: 0,bad_ind,vehicle_year,vehicle_make,bankruptcy_ind,tot_derog,tot_tr,age_oldest_tr,tot_open_tr,tot_rev_tr,tot_rev_debt,tot_rev_line,rev_util,fico_score,purch_price,msrp,down_pyt,loan_term,loan_amt,ltv,tot_income,veh_mileage,used_ind
0,1,1998.0,FORD,N,"(3.0, inf]",-999901,64.0,2.0,1.0,-999999.0,500.0,101,650.0,"(15016.64294, 18049.967969]",17350.0,0.00,36,17200.00,99.0,6550.00,24000.0,0
1,0,2000.0,DAEWOO,N,"(-990000.0, 0.0]",-999901,240.0,11.0,7.0,-999999.0,57241.0,60,649.0,"(18049.967969, inf]",19788.0,683.54,60,19588.54,99.0,4666.67,22.0,0
2,1,1998.0,PLYMOUTH,N,"(3.0, inf]",-999901,60.0,,,-999999.0,,0,613.0,"(-990000.0, 15016.64294]",11450.0,0.00,60,10500.00,92.0,2000.00,19600.0,0
3,1,1997.0,FORD,N,"(0.0, 3.0]",-999901,35.0,5.0,4.0,-999999.0,5946.0,68,603.0,"(-990000.0, 15016.64294]",12100.0,3099.00,60,10800.00,118.0,1500.00,10000.0,0
4,0,2000.0,TOYOTA,N,"(-990000.0, 0.0]",-999901,104.0,2.0,0.0,-999999.0,1800.0,0,764.0,"(18049.967969, inf]",22024.0,0.00,60,26328.04,122.0,4144.00,14.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5840,0,1997.0,PORSCHE,N,"(-990000.0, 0.0]","(16.0, inf]",417.0,4.0,2.0,1859.0,52200.0,4,801.0,"(-990000.0, 15016.64294]",31000.0,0.00,36,31000.00,100.0,5000.00,45000.0,8
5841,0,2000.0,TOYOTA,Y,"(0.0, 3.0]","(-990000.0, 8.0]",62.0,5.0,3.0,4992.0,5066.0,99,628.0,"(18049.967969, inf]",22024.0,0.00,60,24970.00,117.0,2400.00,21.0,8
5842,0,1997.0,CHEVROLET,N,"(-990000.0, 0.0]","(-990000.0, 8.0]",30.0,4.0,3.0,972.0,5616.0,17,735.0,"(18049.967969, inf]",18950.0,0.00,36,20949.00,113.0,1837.50,25000.0,8
5843,0,1999.0,MERCURY,N,"(-990000.0, 0.0]","(8.0, 13.0]",67.0,7.0,5.0,13714.0,14061.0,98,737.0,"(18049.967969, inf]",28700.0,5300.00,48,17100.00,60.0,28000.00,0.0,8


In [322]:
sample_df_copy  ## other columns are not changed
## inplace = True, the df in params also updated

Unnamed: 0,bad_ind,vehicle_year,vehicle_make,bankruptcy_ind,tot_derog,tot_tr,age_oldest_tr,tot_open_tr,tot_rev_tr,tot_rev_debt,tot_rev_line,rev_util,fico_score,purch_price,msrp,down_pyt,loan_term,loan_amt,ltv,tot_income,veh_mileage,used_ind
0,1,1998.0,FORD,N,"(3.0, inf]",-999901,64.0,2.0,1.0,-999999.0,500.0,101,650.0,"(15016.64294, 18049.967969]",17350.0,0.00,36,17200.00,99.0,6550.00,24000.0,0
1,0,2000.0,DAEWOO,N,"(-990000.0, 0.0]",-999901,240.0,11.0,7.0,-999999.0,57241.0,60,649.0,"(18049.967969, inf]",19788.0,683.54,60,19588.54,99.0,4666.67,22.0,0
2,1,1998.0,PLYMOUTH,N,"(3.0, inf]",-999901,60.0,,,-999999.0,,0,613.0,"(-990000.0, 15016.64294]",11450.0,0.00,60,10500.00,92.0,2000.00,19600.0,0
3,1,1997.0,FORD,N,"(0.0, 3.0]",-999901,35.0,5.0,4.0,-999999.0,5946.0,68,603.0,"(-990000.0, 15016.64294]",12100.0,3099.00,60,10800.00,118.0,1500.00,10000.0,0
4,0,2000.0,TOYOTA,N,"(-990000.0, 0.0]",-999901,104.0,2.0,0.0,-999999.0,1800.0,0,764.0,"(18049.967969, inf]",22024.0,0.00,60,26328.04,122.0,4144.00,14.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5840,0,1997.0,PORSCHE,N,"(-990000.0, 0.0]","(16.0, inf]",417.0,4.0,2.0,1859.0,52200.0,4,801.0,"(-990000.0, 15016.64294]",31000.0,0.00,36,31000.00,100.0,5000.00,45000.0,8
5841,0,2000.0,TOYOTA,Y,"(0.0, 3.0]","(-990000.0, 8.0]",62.0,5.0,3.0,4992.0,5066.0,99,628.0,"(18049.967969, inf]",22024.0,0.00,60,24970.00,117.0,2400.00,21.0,8
5842,0,1997.0,CHEVROLET,N,"(-990000.0, 0.0]","(-990000.0, 8.0]",30.0,4.0,3.0,972.0,5616.0,17,735.0,"(18049.967969, inf]",18950.0,0.00,36,20949.00,113.0,1837.50,25000.0,8
5843,0,1999.0,MERCURY,N,"(-990000.0, 0.0]","(8.0, 13.0]",67.0,7.0,5.0,13714.0,14061.0,98,737.0,"(18049.967969, inf]",28700.0,5300.00,48,17100.00,60.0,28000.00,0.0,8


In [323]:
helper_chi_u_shape.transform(sample_df, inplace = False, bin_only = None) 
## bin_only = None
# will fill with bin numbers

----- No missing_values list is passed in. -----
----- transforming:  vehicle_year  -----
----- transforming:  vehicle_make  -----
----- transforming:  bankruptcy_ind  -----
----- transforming:  used_ind  -----
----- transforming:  tot_tr  -----
----- transforming:  tot_rev_tr  -----
----- transforming:  loan_term  -----
----- transforming:  tot_income  -----
----- transforming:  down_pyt  -----
----- transforming:  tot_rev_line  -----
----- transforming:  ltv  -----
----- transforming:  loan_amt  -----
----- transforming:  tot_open_tr  -----
----- transforming:  tot_derog  -----
----- transforming:  rev_util  -----
----- transforming:  veh_mileage  -----
----- transforming:  fico_score  -----
----- transforming:  msrp  -----
----- transforming:  age_oldest_tr  -----
----- transforming:  tot_rev_debt  -----
----- transforming:  purch_price  -----


Unnamed: 0,bad_ind,vehicle_year,vehicle_make,bankruptcy_ind,tot_derog,tot_tr,age_oldest_tr,tot_open_tr,tot_rev_tr,tot_rev_debt,tot_rev_line,rev_util,fico_score,purch_price,msrp,down_pyt,loan_term,loan_amt,ltv,tot_income,veh_mileage,used_ind
0,1,6.0,26,0,2.0,4.0,2.0,0.0,0.0,2.0,0.0,2,2.0,1.0,1.0,0.0,0,1.0,2.0,1.0,0.0,4
1,0,2.0,32,0,0.0,4.0,5.0,1.0,1.0,2.0,5.0,0,2.0,2.0,1.0,0.0,1,1.0,2.0,0.0,0.0,4
2,1,6.0,18,0,2.0,4.0,1.0,2.0,2.0,2.0,6.0,0,1.0,0.0,0.0,0.0,1,1.0,1.0,0.0,0.0,4
3,1,4.0,26,0,1.0,4.0,0.0,1.0,1.0,2.0,2.0,0,0.0,0.0,0.0,0.0,1,1.0,3.0,0.0,0.0,4
4,0,2.0,14,0,0.0,4.0,2.0,0.0,0.0,2.0,1.0,0,8.0,2.0,1.0,0.0,1,1.0,3.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5840,0,4.0,3,0,0.0,3.0,5.0,1.0,0.0,0.0,5.0,0,8.0,0.0,1.0,0.0,0,1.0,2.0,0.0,1.0,6
5841,0,2.0,14,1,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2,1.0,2.0,1.0,0.0,1,1.0,3.0,0.0,0.0,6
5842,0,4.0,24,0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0,7.0,2.0,1.0,0.0,0,1.0,3.0,0.0,0.0,6
5843,0,5.0,23,0,0.0,1.0,2.0,1.0,1.0,1.0,3.0,2,7.0,2.0,1.0,2.0,1,1.0,0.0,1.0,0.0,6


## set_rules()

In [324]:
helper_chi_u_shape

<__main__.VarBinHelper at 0x7f9dd431fac8>

In [325]:
## save the trained object
import pickle
pickle.dump( helper_chi_u_shape, open( "save.p", "wb" ) )

## helper_chi_u_shape = pickle.load( open( "save.p", "rb" ) )

In [326]:
dict_rule = {"tot_rev_debt":[0,2000,4000,6000,10000], 
             "vehicle_make":[['nan'],['FORD','LEXUS'],['B50'],['SUZUKI']],
             "tot_tr":[0,3,5,9,12,15]
             }

In [327]:
## params, only 2 and must have 2
    ## dict_rules : python dictionary like the example above, user can input 1 or many feature names
        # if dict_rules = "recover", will overwrite self.model with self.model_backup
            # in this case , we can use data = None

    ## data: pandas df, logically, it should be the same train data as in fit( )
    
helper_chi_u_shape.set_rules(dict_rules = dict_rule, data = sample_df)

updated bins for  tot_rev_debt  , the df is now like:
19                  bin total total_rate  bad  bad_r...
Name: bin_info, dtype: object
user bin df is:
              bin total total_rate  bad  bad_rate       woe        iv
0    (-990000, 0]   304  0.0520103   50  0.164474  0.268693  0.003460
1       (0, 2000]  1845   0.315654  399   0.21626 -0.069024  0.001535
2    (2000, 4000]   848   0.145081  172   0.20283  0.012080  0.000021
3    (4000, 6000]   539  0.0922156   93  0.172542  0.211101  0.003855
4   (6000, 10000]   783   0.133961  162  0.206897 -0.012884  0.000022
5    (10000, inf]   977   0.167151  146  0.149437  0.382405  0.021726
6  [nan, -999999]   549  0.0939264  175  0.318761 -0.597149  0.039253
updated bins for  vehicle_make  , the df is now like:
1                                                  ...
Name: bin_info, dtype: object
user bin df is:
                                                 bin total  ...        woe        iv
0                                           

<__main__.VarBinHelper at 0x7f9dd431fac8>

In [328]:
trans_df = helper_chi_u_shape.transform(sample_df, inplace = False, bin_only = True,  feature_list = ["tot_tr"])

## tot_tr now following user's bin
trans_df.tot_tr.value_counts()

----- No missing_values list is passed in. -----
----- transforming:  tot_tr  -----


(15, inf]       2717
(5, 9]           713
(9, 12]          621
(12, 15]         591
(0, 3]           413
(3, 5]           312
-999902           59
-999901           30
(-990000, 0]      19
-999999            9
Name: tot_tr, dtype: int64

In [329]:
## to restore rules found by fit(), simply pass dict_rules = "recover"
helper_chi_u_shape.set_rules(dict_rules = "recover", data = None)

trans_df = helper_chi_u_shape.transform(sample_df, inplace = False, bin_only = True, feature_list = ["tot_tr"])

trans_df.tot_tr.value_counts()
## tot_tr now following original bin

----- No missing_values list is passed in. -----
----- transforming:  tot_tr  -----


(16.0, inf]         2531
(-990000.0, 8.0]    1287
(8.0, 13.0]          981
(13.0, 16.0]         587
-999902               59
-999901               30
-999999                9
Name: tot_tr, dtype: int64

## others

drop_empty_missing_bin()

evaluate_model_bin_count()


In [334]:
helper_chi_merge_mono.woe_encoder.loc[helper_chi_merge_mono.woe_encoder["var"] == "purch_price"]
## when missing_values is a dict, even a feature has no missing values, it will have an empty bin  [nan]
## call self.drop_empty_missing_bin() to drop those

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv,var,is_cat
0,"(-990000, 11549.985764]",1221,0.0817793,297,0.243243,-0.221639,0.010932,purch_price,False
1,"(11549.985764, inf]",4624,0.791104,900,0.194637,0.06354,0.003134,purch_price,False
2,[nan],0,0.0,0,,0.0,0.0,purch_price,False


In [335]:
helper_chi_merge_mono.drop_empty_missing_bin()

In [336]:
helper_chi_merge_mono.woe_encoder.loc[helper_chi_merge_mono.woe_encoder["var"] == "purch_price"]


Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv,var,is_cat
0,"(-990000, 11549.985764]",1221,0.0817793,297,0.243243,-0.221639,0.010932,purch_price,False
1,"(11549.985764, inf]",4624,0.791104,900,0.194637,0.06354,0.003134,purch_price,False


In [337]:
helper_chi_u_shape.evaluate_model_bin_count()
# this method returns the object's model, with all featured fitted, and min, max, ave bin count

min, max, mean of bin count is :  2   35   6.095238095238095


Unnamed: 0,feature_name,is_cat,bin_info,bin_count
0,vehicle_year,True,...,12
1,vehicle_make,True,...,35
2,bankruptcy_ind,True,bin total total_rate bad ... woe...,3
3,used_ind,True,bin total total_rate bad bad_rate ...,9
4,tot_tr,False,bin total tota...,5
5,tot_rev_tr,False,bin total total_rate ... ...,3
6,loan_term,False,bin total total_rate ... ...,2
7,tot_income,False,bin total ... ...,3
8,down_pyt,False,bin total total_rate...,3
9,tot_rev_line,False,bin total total_rate...,7


In [338]:
helper_chi_u_shape.model.bin_info[18]
## to view a feature's bins

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv,var,is_cat
0,"(-990000.0, 39.75841]",596,0.101967,205,0.34396,-0.710921,0.061949,age_oldest_tr,False
1,"(39.75841, 63.63275]",488,0.0834902,140,0.286885,-0.446059,0.018774,age_oldest_tr,False
2,"(63.63275, 121.61329]",1202,0.205646,287,0.238769,-0.197177,0.00846,age_oldest_tr,False
3,"(121.61329, 162.54073]",1199,0.205133,239,0.199333,0.033851,0.000233,age_oldest_tr,False
4,"(162.54073, 186.41507]",468,0.0800684,81,0.173077,0.207357,0.003233,age_oldest_tr,False
5,"(186.41507, inf]",1676,0.286741,173,0.103222,0.805308,0.144019,age_oldest_tr,False
6,[nan],216,0.0369547,72,0.333333,-0.663471,0.019353,age_oldest_tr,False


In [339]:
helper_chi_u_shape.woe_encoder.loc[helper_chi_u_shape.woe_encoder["var"] == "tot_tr"]
## these 2 lines shows same df, for the bins of 1 feature

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv,var,is_cat
0,"(-990000.0, 8.0]",1287,0.220188,365,0.283605,-0.429971,0.045819,tot_tr,False
1,"(8.0, 13.0]",981,0.167836,233,0.237513,-0.190254,0.006416,tot_tr,False
2,"(13.0, 16.0]",587,0.100428,110,0.187394,0.110418,0.001185,tot_tr,False
3,"(16.0, inf]",2531,0.43302,373,0.147373,0.39874,0.060877,tot_tr,False
4,"[nan, -999999, -999902, -999901]",459,0.0785287,116,0.252723,-0.272478,0.006298,tot_tr,False


# Diffrent fit() Params Demo

## init_method

In [340]:
## init_method = "quantile"
df_na, df_int = helper.init_cont(sample_df.tot_rev_debt, sample_df.bad_ind, init_method = "quantile", min_bin_size = 5, min_samples = 0.005)

In [341]:
df_na

Unnamed: 0,bin,total,total_rate,bad,bad_rate
0,"[nan, -999999.0]",549,0.0939264,175,0.318761


In [342]:
df_int

Unnamed: 0,bin,bin_low,bin_up,total,total_rate,bad,bad_rate
0,"(-990000, 1.0]",-990000.0,1.0,305,0.0520103,50,0.163934
1,"(1.0, 132.0]",1.0,132.0,233,0.0203593,43,0.184549
2,"(132.0, 296.0]",132.0,296.0,231,0.0196749,48,0.207792
3,"(296.0, 393.0]",296.0,393.0,116,0.019846,28,0.241379
4,"(393.0, 482.0]",393.0,482.0,117,0.0200171,22,0.188034
5,"(482.0, 685.0]",482.0,685.0,232,0.0196749,63,0.271552
6,"(685.0, 965.0]",685.0,965.0,231,0.0196749,41,0.177489
7,"(965.0, 1086.0]",965.0,1086.0,117,0.0200171,23,0.196581
8,"(1086.0, 1255.0]",1086.0,1255.0,116,0.019846,24,0.206897
9,"(1255.0, 1600.0]",1255.0,1600.0,232,0.0196749,56,0.241379


In [343]:
## init_method = "step"
df_na, df_int = helper.init_cont(sample_df.tot_rev_debt, sample_df.bad_ind, init_method = "step", min_bin_size = 5, min_samples = 0.005)

In [344]:
df_na

Unnamed: 0,bin,total,total_rate,bad,bad_rate
0,"[nan, -999999.0]",549,0.0939264,175,0.318761


In [345]:
df_int  ## step will give fewer bins after init, since there are many small / empty bins need to merge

Unnamed: 0,bin,bin_low,bin_up,total,total_rate,bad,bad_rate
0,"(-990000, 0.0]",-990000.0,0.0,304,0.0520103,50,0.164474
1,"(0.0, 505.443172]",0.0,505.4432,735,0.125749,152,0.206803
2,"(505.443172, 1010.886344]",505.443172,1010.886,473,0.0809239,102,0.215645
3,"(1010.886344, 1516.329516]",1010.886344,1516.33,346,0.0591959,77,0.222543
4,"(1516.329516, 2021.772688]",1516.329516,2021.773,302,0.0516681,69,0.228477
5,"(2021.772688, 2527.21586]",2021.772688,2527.216,263,0.0449957,50,0.190114
6,"(2527.21586, 3032.659032]",2527.21586,3032.659,237,0.0405475,65,0.274262
7,"(3032.659032, 3538.102204]",3032.659032,3538.102,174,0.029769,26,0.149425
8,"(3538.102204, 4043.545376]",3538.102204,4043.545,187,0.0319932,35,0.187166
9,"(4043.545376, 4548.988548]",4043.545376,4548.989,150,0.025663,26,0.173333


## merge_category

In [353]:
helper = VarBinHelper(label="bad_ind")
helper.fit_single_cat(sample_df.vehicle_make, sample_df.bad_ind, merge_category = True )
## will merge cat values into bins using chi2 merge

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv
0,"[FODGE, RICART, SEDAN, MERC BENZ, M-B, CHEVREL...",117,0.0200171,0,0.0,32.361635,0.8146109
1,"[LEXUS, AUDI, PORSCHE, VOLKSWAGEN, MITS, VOLKS...",146,0.0249786,12,0.0821918,1.056315,0.01986351
2,"[MERCEDES, BMW, OLDSMOBILE, HONDA]",261,0.0446536,35,0.1341,0.508568,0.009857732
3,"[SATURN, MITSU, LINCOLN]",122,0.0208725,17,0.139344,0.464128,0.003893197
4,"[CHRYSLER, JEEP, BUICK, GMC, TOYOTA, VW, CHEVY...",3358,0.574508,668,0.198928,0.03639,0.0007526019
5,"[None, nan]",88,0.0150556,18,0.204545,0.001505,3.408124e-08
6,[FORD],1090,0.186484,249,0.22844,-0.13948,0.003777394
7,"[OLDS, HYUNDAI, SUBARU]",190,0.0325064,45,0.236842,-0.186547,0.001193487
8,"[PONTIAC, ISUZU]",262,0.0448246,68,0.259542,-0.308268,0.004645695
9,"[MAZDA, SUZUKI, DAEWOO, MERC, PLYM, PLY, SAAB,...",141,0.0241232,42,0.297872,-0.499168,0.006882651


In [355]:
helper.fit_single_cat(sample_df.vehicle_make, sample_df.bad_ind, merge_category = False )
## each cat value is 1 bin
## but if 1 value too small, it will get merged
## that is, min_sample takes priority

vehicle_make is categorical, not merging bins according to user's input merge_category


Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv
0,"[FODGE, RICART, SEDAN, MERC BENZ, M-B, CHEVREL...",117,0.0200171,0,0.0,32.361635,0.8146109
1,"[LEXUS, AUDI, PORSCHE, VOLKSWAGEN, MITS, VOLKS...",146,0.0249786,12,0.0821918,1.056315,0.01986351
2,"[MERCEDES, BMW, OLDSMOBILE, HONDA]",261,0.0446536,35,0.1341,0.508568,0.009857732
3,"[SATURN, MITSU, LINCOLN]",122,0.0208725,17,0.139344,0.464128,0.003893197
4,[CHRYSLER],93,0.015911,15,0.16129,0.29204,0.001241194
5,[JEEP],196,0.0335329,32,0.163265,0.277512,0.002372864
6,[BUICK],99,0.0169376,17,0.171717,0.216887,0.0007460543
7,[GMC],132,0.0225834,24,0.181818,0.147459,0.0004697558
8,[TOYOTA],408,0.0698033,78,0.191176,0.085765,0.000500472
9,[VW],68,0.0116339,13,0.191176,0.085765,8.3412e-05


In [361]:
helper.init_cat_bin(sample_df.vehicle_make, sample_df.bad_ind, merge_category = False, init_merge_small_bin = False)[1]
## each cat value is 1 bin

Unnamed: 0,bin,total,total_rate,bad,bad_rate
0,[FODGE],1,0.000171086,0,0
1,[RICART],1,0.000171086,0,0
2,[SEDAN],1,0.000171086,0,0
3,[MERC BENZ],2,0.000342173,0,0
4,[M-B],1,0.000171086,0,0
...,...,...,...,...,...
150,[VE],1,0.000171086,1,1
151,[SUK],1,0.000171086,1,1
152,[CHYRLER],1,0.000171086,1,1
153,[HUUNDAI],1,0.000171086,1,1


## force_mono

In [None]:
helper = VarBinHelper()
helper.fit_single_cont(sample_df.tot_tr, sample_df.bad_ind, method = "chi", min_bin = 8, init_method = "quantile", force_mono=None)
# force_mono=None, no monotonous forcing

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv
0,"(-990000.0, 8.0]",1287,0.220188,365,0.283605,0.926648,0.088305
1,"(8.0, 13.0]",981,0.167836,233,0.237513,1.166365,0.102768
2,"(13.0, 16.0]",587,0.100428,110,0.187394,1.467036,0.092113
3,"(16.0, 19.0]",540,0.0923867,79,0.146296,1.76395,0.115283
4,"(19.0, 23.0]",644,0.11018,108,0.167702,1.602003,0.117307
5,"(23.0, 24.0]",123,0.0210436,11,0.0894309,2.320604,0.040099
6,"(24.0, 26.0]",228,0.0390077,28,0.122807,1.966113,0.057857
7,"(26.0, inf]",996,0.170402,147,0.14759,1.753627,0.210615
8,[nan],361,0.0617622,96,0.265928,1.015382,0.029358
9,[-999999.0],9,0.00153978,0,0.0,29.567536,0.045527


In [None]:
helper.fit_single_cont(sample_df.tot_tr, sample_df.bad_ind, method = "chi", min_bin = 8, init_method = "quantile", force_mono="u_shape")
# force_mono="u_shape", allows maximumly 1 turning point in bad_rate

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv
0,"(-990000.0, 8.0]",1287,0.220188,365,0.283605,0.926648,0.088305
1,"(8.0, 13.0]",981,0.167836,233,0.237513,1.166365,0.102768
2,"(13.0, 16.0]",587,0.100428,110,0.187394,1.467036,0.092113
3,"(16.0, 23.0]",1184,0.0923867,187,0.157939,1.673642,0.10578
4,"(23.0, 24.0]",123,0.0210436,11,0.0894309,2.320604,0.040099
5,"(24.0, 26.0]",228,0.0390077,28,0.122807,1.966113,0.057857
6,"(26.0, inf]",996,0.170402,147,0.14759,1.753627,0.210615
7,[nan],361,0.0617622,96,0.265928,1.015382,0.029358
8,[-999999.0],9,0.00153978,0,0.0,29.567536,0.045527
9,[-999902.0],59,0.0100941,11,0.186441,1.473306,0.009326


In [None]:
helper.fit_single_cont(sample_df.tot_tr, sample_df.bad_ind, method = "chi", min_bin = 8, init_method = "quantile", force_mono="mono")
# force_mono="mono"  (or any other str), allows 0 turning point in bad_rate

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv
0,"(-990000.0, 8.0]",1287,0.220188,365,0.283605,0.926648,0.088305
1,"(8.0, 13.0]",981,0.167836,233,0.237513,1.166365,0.102768
2,"(13.0, 16.0]",587,0.100428,110,0.187394,1.467036,0.092113
3,"(16.0, 23.0]",1184,0.0923867,187,0.157939,1.673642,0.10578
4,"(23.0, inf]",1347,0.0210436,186,0.138085,1.83129,0.027894
5,[nan],361,0.0617622,96,0.265928,1.015382,0.029358
6,[-999999.0],9,0.00153978,0,0.0,29.567536,0.045527
7,[-999902.0],59,0.0100941,11,0.186441,1.473306,0.009326
8,[-999901.0],30,0.00513259,9,0.3,0.847298,0.00174


## init_merge_small_bin

In [None]:
helper = VarBinHelper()

In [None]:
sample_df.used_ind.value_counts()

0    5000
1     500
7      50
3      50
6      50
2      50
5      50
4      50
8      45
Name: used_ind, dtype: int64

In [None]:
5845*0.01
# min bin size --> all values above except 0,1, should merge

58.45

In [None]:
helper.init_cont(sample_df.used_ind, sample_df.bad_ind, min_sample = 0.02, init_merge_small_bin=False)[1]

Unnamed: 0_level_0,bin,bin_low,bin_up,total,total_rate,bad,bad_rate
bin_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,"(-990000, 0]",-990000,0.0,5000,0.855432,1025,0.205
1,"(0, 1]",0,1.0,500,0.0855432,98,0.196
2,"(1, 2]",1,2.0,50,0.00855432,15,0.3
3,"(2, 3]",2,3.0,50,0.00855432,8,0.16
4,"(3, 4]",3,4.0,50,0.00855432,11,0.22
5,"(4, 5]",4,5.0,50,0.00855432,9,0.18
6,"(5, 6]",5,6.0,50,0.00855432,6,0.12
7,"(6, 7]",6,7.0,50,0.00855432,14,0.28
8,"(7, inf]",7,inf,45,0.00769889,11,0.244444


In [None]:
helper.init_cont(sample_df.used_ind, sample_df.bad_ind, min_sample = 0.01,  init_method = "quantile")[1]

Unnamed: 0,bin,bin_low,bin_up,total,total_rate,bad,bad_rate
0,"(-990000, 0]",-990000,0.0,5000,0.855432,1025,0.205
1,"(0, 1]",0,1.0,500,0.0855432,98,0.196
2,"(1, 3]",1,3.0,100,0.00855432,23,0.23
3,"(3, 6]",3,6.0,150,0.00855432,26,0.173333
4,"(6, inf]",6,inf,95,0.00855432,25,0.263158


In [None]:
helper.init_cont(sample_df.tot_rev_debt, sample_df.bad_ind, min_sample = 0.01, init_method = "step", init_merge_small_bin=False)[1]

Unnamed: 0,bin,bin_low,bin_up,total,total_rate,bad,bad_rate
0,"(-990000, 0.0]",-990000.0,0.0,304,0.0520103,50,0.164474
1,"(0.0, 252.721586]",0.0,252.7216,412,0.0704876,83,0.201456
2,"(252.721586, 505.443172]",252.721586,505.4432,323,0.0552609,69,0.213622
3,"(505.443172, 758.164758]",505.443172,758.1648,259,0.0443114,67,0.258687
4,"(758.164758, 1010.886344]",758.164758,1010.886,214,0.0366125,35,0.163551
5,"(1010.886344, 1263.60793]",1010.886344,1263.608,189,0.0323353,38,0.201058
6,"(1263.60793, 1516.329516]",1263.60793,1516.33,157,0.0268606,39,0.248408
7,"(1516.329516, 1769.051102]",1516.329516,1769.051,170,0.0290847,39,0.229412
8,"(1769.051102, 2021.772688]",1769.051102,2021.773,132,0.0225834,30,0.227273
9,"(2021.772688, 2274.494274]",2021.772688,2274.494,147,0.0251497,29,0.197279


In [None]:
test_sr = sample_df.used_ind.sort_values().reset_index(drop=True)

In [None]:
test_sr

0       0
1       0
2       0
3       0
4       0
       ..
5840    8
5841    8
5842    8
5843    8
5844    8
Name: used_ind, Length: 5845, dtype: int64

In [None]:
test_sr[test_sr>4].index[0]

5650

# Draft force_mono "u_shape"

In [None]:
helper = VarBinHelper()
helper.find_turn_count([0,0.1,0.2,0.3,0.4,0.3])

1

In [None]:
helper.choose_turning_point_and_neighbor([0,0.1,0.2,0.3,0.4,0.3])

(3, 4, 4)

In [None]:
test_df = helper_chi_u_shape.model.bin_info[16].copy()

In [None]:
test_df.bad_rate[1] = 0.50
test_df.bad[1] = 588/2

test_df.bad_rate[0] = 85/338
test_df.bad[0] = 85

test_df

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv,var,is_cat
0,"(-990000.0, 606.879872]",338,0.0578272,85,0.251479,0.071036,0.000146,fico_score,False
1,"(606.879872, 638.888816]",588,0.100599,294,0.5,0.428193,0.009084,fico_score,False
2,"(638.888816, 653.438336]",468,0.0417451,157,0.33547,0.683547,0.00939,fico_score,False
3,"(653.438336, 663.138016]",323,0.0552609,87,0.26935,0.997924,0.025439,fico_score,False
4,"(663.138016, 689.327152]",940,0.160821,195,0.207447,1.340385,0.126127,fico_score,False
5,"(689.327152, 717.456224]",985,0.16852,149,0.151269,1.724682,0.202713,fico_score,False
6,"(717.456224, 722.306064]",156,0.0266895,18,0.115385,2.036882,0.041818,fico_score,False
7,"(722.306064, 759.164848]",928,0.158768,65,0.0700431,2.586027,0.353062,fico_score,False
8,"(759.164848, inf]",805,0.137725,26,0.0322981,3.399915,0.438004,fico_score,False
9,[nan],314,0.0537211,105,0.334395,0.688374,0.012248,fico_score,False


In [None]:
helper.force_monotone(test_df[:-1], force_mono="u_shape")

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv,var,is_cat
0,"(-990000.0, 663.138016]",1717,0.0578272,623,0.362842,0.071036,0.000146,fico_score,False
1,"(663.138016, 689.327152]",940,0.160821,195,0.207447,1.340385,0.126127,fico_score,False
2,"(689.327152, 717.456224]",985,0.16852,149,0.151269,1.724682,0.202713,fico_score,False
3,"(717.456224, 722.306064]",156,0.0266895,18,0.115385,2.036882,0.041818,fico_score,False
4,"(722.306064, 759.164848]",928,0.158768,65,0.0700431,2.586027,0.353062,fico_score,False
5,"(759.164848, inf]",805,0.137725,26,0.0322981,3.399915,0.438004,fico_score,False


In [None]:
test_df = helper_chi_u_shape.model.bin_info[16].copy()
test_df.bad_rate[8] = 149/928
test_df.bad[8] = 149
test_df

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv,var,is_cat
0,"(-990000.0, 606.879872]",338,0.0578272,163,0.482249,0.071036,0.000146,fico_score,False
1,"(606.879872, 638.888816]",588,0.100599,232,0.394558,0.428193,0.009084,fico_score,False
2,"(638.888816, 653.438336]",468,0.0417451,157,0.33547,0.683547,0.00939,fico_score,False
3,"(653.438336, 663.138016]",323,0.0552609,87,0.26935,0.997924,0.025439,fico_score,False
4,"(663.138016, 689.327152]",940,0.160821,195,0.207447,1.340385,0.126127,fico_score,False
5,"(689.327152, 717.456224]",985,0.16852,149,0.151269,1.724682,0.202713,fico_score,False
6,"(717.456224, 722.306064]",156,0.0266895,18,0.115385,2.036882,0.041818,fico_score,False
7,"(722.306064, 759.164848]",928,0.158768,65,0.0700431,2.586027,0.353062,fico_score,False
8,"(759.164848, inf]",805,0.137725,149,0.16056,3.399915,0.438004,fico_score,False
9,[nan],314,0.0537211,105,0.334395,0.688374,0.012248,fico_score,False


In [None]:
helper.force_monotone(test_df[:-1], force_mono="u_shape")

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv,var,is_cat
0,"(-990000.0, 606.879872]",338,0.0578272,163,0.482249,0.071036,0.000146,fico_score,False
1,"(606.879872, 638.888816]",588,0.100599,232,0.394558,0.428193,0.009084,fico_score,False
2,"(638.888816, 653.438336]",468,0.0417451,157,0.33547,0.683547,0.00939,fico_score,False
3,"(653.438336, 663.138016]",323,0.0552609,87,0.26935,0.997924,0.025439,fico_score,False
4,"(663.138016, 689.327152]",940,0.160821,195,0.207447,1.340385,0.126127,fico_score,False
5,"(689.327152, inf]",2874,0.16852,381,0.132568,1.724682,0.202713,fico_score,False


In [None]:
940*0.33

310.2

In [None]:
test_df = helper_chi_u_shape.model.bin_info[16].copy()

In [None]:
test_df.bad_rate[6] = 30/156
test_df.bad[6] = 30

test_df.bad_rate[7] = 300/928
test_df.bad[7] = 300

test_df.bad_rate[8] = 320/805
test_df.bad[8] = 320

test_df

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv,var,is_cat
0,"(-990000.0, 606.879872]",338,0.0578272,163,0.482249,0.071036,0.000146,fico_score,False
1,"(606.879872, 638.888816]",588,0.100599,232,0.394558,0.428193,0.009084,fico_score,False
2,"(638.888816, 653.438336]",468,0.0417451,157,0.33547,0.683547,0.00939,fico_score,False
3,"(653.438336, 663.138016]",323,0.0552609,87,0.26935,0.997924,0.025439,fico_score,False
4,"(663.138016, 689.327152]",940,0.160821,195,0.207447,1.340385,0.126127,fico_score,False
5,"(689.327152, 717.456224]",985,0.16852,149,0.151269,1.724682,0.202713,fico_score,False
6,"(717.456224, 722.306064]",156,0.0266895,30,0.192308,2.036882,0.041818,fico_score,False
7,"(722.306064, 759.164848]",928,0.158768,300,0.323276,2.586027,0.353062,fico_score,False
8,"(759.164848, inf]",805,0.137725,320,0.397516,3.399915,0.438004,fico_score,False
9,[nan],314,0.0537211,105,0.334395,0.688374,0.012248,fico_score,False


In [None]:
helper.force_monotone(test_df[:-1], force_mono="u_shape")

Unnamed: 0,bin,total,total_rate,bad,bad_rate,woe,iv,var,is_cat
0,"(-990000.0, 606.879872]",338,0.0578272,163,0.482249,0.071036,0.000146,fico_score,False
1,"(606.879872, 638.888816]",588,0.100599,232,0.394558,0.428193,0.009084,fico_score,False
2,"(638.888816, 653.438336]",468,0.0417451,157,0.33547,0.683547,0.00939,fico_score,False
3,"(653.438336, 663.138016]",323,0.0552609,87,0.26935,0.997924,0.025439,fico_score,False
4,"(663.138016, 689.327152]",940,0.160821,195,0.207447,1.340385,0.126127,fico_score,False
5,"(689.327152, 717.456224]",985,0.16852,149,0.151269,1.724682,0.202713,fico_score,False
6,"(717.456224, 722.306064]",156,0.0266895,30,0.192308,2.036882,0.041818,fico_score,False
7,"(722.306064, 759.164848]",928,0.158768,300,0.323276,2.586027,0.353062,fico_score,False
8,"(759.164848, inf]",805,0.137725,320,0.397516,3.399915,0.438004,fico_score,False
