# Imports and function definitions

In [4]:
import os
import pandas as pd
import numpy as np
from time import time as now
from scipy.stats import chisquare

In [2]:
# Critical value of the chi-squared distribution via its percent point function
# (the inverse of the cumulative distribution function).
from scipy.stats import chi2

# p: cumulative probability to look up; df: degrees of freedom.
p, df = 0.95, 1

# Value v such that P(X <= v) == p for X ~ chi2(df).
value = chi2.ppf(p, df)
print(value)  # the critical value for later Chi2 merging

3.841458820694124


In [5]:
## completely YL DIY
def init_equal_frequency(x, bin_rate):
    """Build the initial equal-frequency bins for one variable.

    Parameters
    ----------
    x : pandas.Series
        Values of the variable to bin.
    bin_rate : float
        If greater than 1, the fixed number of samples per bin (truncated to
        an int). Otherwise the proportion of all samples per bin (e.g. 0.05);
        the resulting bin size is never allowed to drop below one sample.

    Returns
    -------
    pandas.DataFrame
        One row per bin, indexed by ``bin_num``, with columns ``bin_low``
        (exclusive) and ``bin_up`` (inclusive), i.e. intervals ``(low, up]``.
        The first bin starts at ``-inf`` and the last one ends at ``+inf``.
        When ``x`` holds fewer samples than one bin, a single
        ``(-inf, +inf]`` bin is returned.

    Raises
    ------
    ValueError
        If ``bin_rate`` is not a positive number.
    """
    if not bin_rate > 0:
        raise ValueError("bin_rate must be a positive number")

    if bin_rate > 1:
        bin_size = int(bin_rate)
    else:
        # A small proportion on a small sample truncates to 0, which would
        # make the stepping below meaningless; use at least one sample per bin.
        bin_size = max(1, int(bin_rate * len(x)))

    sorted_values = x.sort_values().to_numpy()

    # The value of every bin_size-th sample closes a bin. The last of those
    # cut points is dropped because the final bin is open-ended (+inf).
    inner_edges = sorted_values[bin_size - 1::bin_size][:-1]

    # NaN samples are sorted to the end and can never fall inside an interval,
    # so they must not become bin edges.
    inner_edges = inner_edges[~pd.isna(inner_edges)]

    # Heavily tied values yield repeated cut points, i.e. empty (v, v] bins;
    # keep each edge once (order is preserved, so edges stay ascending).
    inner_edges = pd.unique(inner_edges)

    result = pd.DataFrame({
        'bin_low': np.concatenate(([-np.inf], inner_edges)),
        'bin_up': np.concatenate((inner_edges, [np.inf])),
    })
    result.index.name = 'bin_num'

    return result

In [7]:
def mapping_bin(bin_data, original_data, label):
    """Attach a ``bin`` column giving the bin each sample falls into.

    Parameters
    ----------
    bin_data : pandas.DataFrame
        Bin table with ``bin_low`` (exclusive) and ``bin_up`` (inclusive)
        columns; its index values are used as the bin numbers.
    original_data : pandas.DataFrame
        Frame holding the label column and the variable to bin. The variable
        is taken from the first column, or from the second one when the first
        column is the label.
    label : str
        Name of the label column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``original_data`` with an added ``bin`` column. Samples that
        match no interval keep the initial value 0.
    """
    columns = original_data.columns
    var_name = columns[1] if columns[0] == label else columns[0]

    mapped = original_data.copy()
    mapped['bin'] = 0

    bin_edges = zip(bin_data.index, bin_data['bin_low'], bin_data['bin_up'])
    for bin_num, low, up in bin_edges:
        # Interval is (low, up]: strictly above the lower edge, at most the upper.
        in_bin = (mapped[var_name] > low) & (mapped[var_name] <= up)
        mapped.loc[in_bin, 'bin'] = bin_num

    return mapped

In [14]:
def _count_per_bin(bin_of_sample, bin_index):
    """Count how many entries of ``bin_of_sample`` fall in each bin of ``bin_index``."""
    counts = bin_of_sample.value_counts().reindex(bin_index, fill_value=0)
    return counts.to_numpy()


def calc_Chi_2(bin_data, original_data_mapped, label):
    """Compute per-bin counts and chi-squared statistics for a binned variable.

    Parameters
    ----------
    bin_data : pandas.DataFrame
        Bin table (e.g. from the equal-frequency initialisation) with
        ``bin_low`` and ``bin_up`` columns, indexed by bin number.
    original_data_mapped : pandas.DataFrame
        Sample-level data holding the label column and a ``bin`` column with
        the bin number of every sample.
    label : str
        Name of the label column; 1 is counted as "bad", 0 as "good".

    Returns
    -------
    pandas.DataFrame
        Indexed like ``bin_data`` with the columns ``bin_low``, ``bin_up``,
        ``sample_count``, ``bad_count``, ``good_count``, ``bad_count_exp``,
        ``good_count_exp``, ``Chi_2`` and ``Chi_2_if_merge``.
        The expected counts spread the overall bad/good totals over the bins
        in proportion to each bin's sample count. ``Chi_2`` is the Pearson
        statistic ``sum((observed - expected)**2 / expected)`` over the bad
        and good cells of the bin. ``Chi_2_if_merge`` is the bin's ``Chi_2``
        plus that of the previous bin (NaN for the first bin). Bins whose
        expected counts are zero get NaN statistics.
    """
    # Results are built as whole columns. Writing into the rows yielded by
    # DataFrame.iterrows() is not guaranteed to update the frame.
    n_samples = len(original_data_mapped)
    is_bad = (original_data_mapped[label] == 1).to_numpy()
    is_good = (original_data_mapped[label] == 0).to_numpy()
    total_bad = is_bad.sum()
    total_good = is_good.sum()

    bin_of_sample = original_data_mapped['bin']
    sample_count = _count_per_bin(bin_of_sample, bin_data.index)
    bad_count = _count_per_bin(bin_of_sample[is_bad], bin_data.index)
    good_count = _count_per_bin(bin_of_sample[is_good], bin_data.index)

    # Empty bins (or a label class that never occurs) give 0/0 here; let those
    # become NaN quietly instead of emitting runtime warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        bad_count_exp = sample_count / n_samples * total_bad
        good_count_exp = sample_count / n_samples * total_good
        chi_2 = ((bad_count - bad_count_exp) ** 2 / bad_count_exp
                 + (good_count - good_count_exp) ** 2 / good_count_exp)

    result = pd.DataFrame(
        {
            'bin_low': bin_data['bin_low'].to_numpy(),
            'bin_up': bin_data['bin_up'].to_numpy(),
            'sample_count': sample_count,
            'bad_count': bad_count,
            'good_count': good_count,
            'bad_count_exp': bad_count_exp,
            'good_count_exp': good_count_exp,
            'Chi_2': chi_2,
        },
        index=bin_data.index,
    )
    # Positional shift pairs every bin with its predecessor regardless of how
    # the bins are labelled; the first bin has no predecessor and stays NaN.
    result['Chi_2_if_merge'] = result['Chi_2'] + result['Chi_2'].shift(1)
    return result

# Sample runs

In [9]:
# Load the accepts data set and keep only the label (bad_ind) and the
# variable to be binned (msrp).
accept = pd.read_csv('Dataset/accepts.csv')
sample_columns = ['bad_ind', 'msrp']
accept_sample = accept.loc[:, sample_columns]
accept_sample.shape

(5845, 2)

In [11]:
# Initial equal-frequency bins for msrp; a bin_rate of 0.02 asks for 2% of
# all samples per bin.
df = init_equal_frequency(x=accept_sample['msrp'], bin_rate=0.02)
df

Unnamed: 0_level_0,bin_low,bin_up
bin_num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-inf,0.0
1,0.0,5000.0
2,5000.0,6100.0
3,6100.0,7000.0
4,7000.0,7950.0
5,7950.0,8550.0
6,8550.0,9125.0
7,9125.0,9775.0
8,9775.0,10225.0
9,10225.0,10900.0


In [12]:
# Tag every sample with the number of the bin its msrp value falls into.
accept_sample_mapped = mapping_bin(
    bin_data=df,
    original_data=accept_sample,
    label='bad_ind',
)
accept_sample_mapped

Unnamed: 0,bad_ind,msrp,bin
0,1,17350.0,25
1,0,19788.0,30
2,1,11450.0,11
3,1,12100.0,12
4,0,22024.0,34
...,...,...,...
5840,0,31000.0,45
5841,0,22024.0,34
5842,0,18950.0,28
5843,0,28700.0,43


In [15]:
# Per-bin observed/expected counts and chi-squared statistics for the sample.
calc_Chi_2(bin_data=df, original_data_mapped=accept_sample_mapped, label='bad_ind')

Unnamed: 0_level_0,bin_low,bin_up,sample_count,bad_count,good_count,bad_count_exp,good_count_exp,Chi_2,Chi_2_if_merge
bin_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,-inf,0.0,158,35,123,32.3569,125.643,0.271508,
1,0.0,5000.0,81,18,63,16.588,64.412,0.15114,0.422648
2,5000.0,6100.0,110,26,84,22.5269,87.4731,0.673347,0.824487
3,6100.0,7000.0,129,30,99,26.418,102.582,0.610772,1.28412
4,7000.0,7950.0,104,24,80,21.2982,82.7018,0.431003,1.04178
5,7950.0,8550.0,117,24,93,23.9605,93.0395,8.19743e-05,0.431085
6,8550.0,9125.0,116,27,89,23.7557,92.2443,0.55718,0.557262
7,9125.0,9775.0,115,22,93,23.5509,91.4491,0.128433,0.685614
8,9775.0,10225.0,115,24,91,23.5509,91.4491,0.0107696,0.139203
9,10225.0,10900.0,118,30,88,24.1653,93.8347,1.77161,1.78238
