In [15]:
import pandas as pd
import numpy as np
from westgate.flaml_model import *
from scipy.stats import ttest_ind

In [3]:
# Read the two 2022 attribute tables: one for accepted loans, one for refusals.
# Both CSV files are decoded with the 'latin' codec.
accepted_df = pd.read_csv(
    '../data/AttributesLoans2022.csv',
    encoding='latin',
)
refused_raw_df = pd.read_csv(
    '../data/refusal2022.csv',
    encoding='latin',
)

  accepted_df = pd.read_csv('../data/AttributesLoans2022.csv', encoding='latin')


In [4]:
# Derived window features on the refused applications.
# Each feature is a plain element-wise sum of existing monthly columns, so a
# missing value in any addend makes the derived value missing as well.
# (The original cell repeated both assignments twice; once is enough.)

# Recurring deposits summed over the three most recent monthly columns.
refused_raw_df['recurring_deposits_90_days'] = (
    refused_raw_df['recurring_deposits_current_month']
    + refused_raw_df['recurring_deposits_previous_month']
    + refused_raw_df['recurring_deposits_2_months_ago']
)

# Micro-loan payments summed over the two most recent monthly columns.
refused_raw_df['sum_micro_loans_60_days'] = (
    refused_raw_df['sum_micro_loan_payments_current_month']
    + refused_raw_df['sum_micro_loan_payments_previous_month']
)

In [5]:
# Keep only the refused applications that satisfy every active auto-refusal
# threshold, and print the row count before and after.
print('# loans before auto-refusal: ' + str(len(refused_raw_df)))

# The active rules are independent row-wise conditions, so they are combined
# into a single mask. A comparison against a missing value is False, so rows
# with a missing rule column are dropped, exactly as with one filter per rule.
passes_auto_refusal_rules = (
    (refused_raw_df['account_age_days'] >= 85)
    & (refused_raw_df['count_nsf_90_days'] <= 8)
    & (refused_raw_df['count_nsf_30_days'] <= 6)
    & (refused_raw_df['count_stop_payment_90_days'] <= 4)
)

# Disabled rules, kept for reference:
#   refused_raw_df['recurring_deposits_90_days'] / 3.0 >= 1800   <-- filters way too much
#   refused_raw_df['sum_micro_loans_60_days'] <= 1000

# .copy() makes refused_df an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning or depend on view/copy semantics.
refused_df = refused_raw_df[passes_auto_refusal_rules].copy()

print('# loans after auto-refusal: ' + str(len(refused_df)))

# loans before auto-refusal: 37398
# loans after auto-refusal: 28579


In [8]:
# Load the model stored as experiment 'default_1.0' under ../default_model/.
# NOTE(review): load_model is not defined in this notebook; presumably it comes
# from the `westgate.flaml_model` star-import — confirm.
default_model = load_model(experiment_id='default_1.0', basefolder='../default_model/')

In [12]:
# Load the model stored as experiment 'refusal_0.2' under ../refusal_model/.
# NOTE(review): refusal_model is not referenced again in the visible part of
# this notebook — confirm whether this cell is still needed.
refusal_model = load_model(experiment_id='refusal_0.2', basefolder='../refusal_model/')

In [9]:
# Run the accepted loans through the default model's own row filter.
# According to the message it prints, rows whose 'account_age_days' is not
# positive are discarded; any other filtering it does is not visible here.
accepted_filtered_df = default_model.filter_df(accepted_df)

Rows with 'account_age_days' column not positive will be discarded.


In [10]:
# Label the two populations (refusal = 0 for accepted, 1 for refused) and stack
# them into one frame for the comparisons below.
# assign() returns new frames instead of writing into the existing objects, so
# no column is written into a filtered slice (which would raise
# SettingWithCopyWarning). The names are rebound so both frames still carry
# the 'refusal' column afterwards.
accepted_filtered_df = accepted_filtered_df.assign(refusal=0)
refused_df = refused_df.assign(refusal=1)

# Original row labels are kept, so index values can repeat across the two
# populations; rows are selected by boolean mask later on, not by label.
df = pd.concat([accepted_filtered_df, refused_df])

In [23]:
# Inspect the column labels of the combined accepted + refused frame.
df.columns

Index(['Id', 'account_age_days', 'active_days_trend',
       'auto_loan_payment_frequency',
       'average_closing_balance_day_after_employer_income',
       'average_closing_balance_day_after_government_income',
       'average_closing_balance_day_after_income',
       'average_closing_balance_day_of_employer_income',
       'average_closing_balance_day_of_government_income',
       'average_closing_balance_day_of_income',
       ...
       'loan_application_id', 'borrower_id', 'error', 'refusal',
       'approbexRequest', 'province', 'refusalReason',
       'recurring_deposits_90_days', 'sum_micro_loans_60_days', 'age'],
      dtype='object', length=762)

# Count variables

In [172]:
# Names of all feature columns whose label begins with 'count_'.
count_vars = list(filter(lambda name: name.startswith('count_'), df.columns))

In [173]:
# Number of 'count_' feature columns selected above.
len(count_vars)

304

In [174]:
# Welch's t-test (equal_var=False) of every 'count_' feature, refused vs.
# accepted rows. Results are collected column-wise in `stats`:
#   'c'      -> feature name
#   'pvalue' -> two-sided p-value
#   'stat'   -> t statistic (refused minus accepted, per argument order)
#
# NOTE(review): ttest_ind uses nan_policy='propagate' by default, so a feature
# with any missing value yields NaN here and is silently dropped by the
# `pvalue < 0.001` filter further down — confirm that this is intended.
stats = {'c': [], 'pvalue': [], 'stat': []}

# The row split does not depend on the feature, so the masks are built once
# instead of being recomputed for every column.
is_refused = df['refusal'] == 1
is_accepted = df['refusal'] == 0

for c in count_vars:
    result = ttest_ind(df.loc[is_refused, c], df.loc[is_accepted, c], equal_var=False)
    stats['c'].append(c)
    stats['pvalue'].append(result.pvalue)
    stats['stat'].append(result.statistic)

In [175]:
# One row per 'count_' feature with its p-value and t statistic.
# `stats` is also reassigned by the sum-variable loop further down, so this
# cell must run right after the count-variable loop.
count_pvalues_df = pd.DataFrame(stats)

In [176]:
# Magnitude of the t statistic, used to rank features regardless of sign.
count_pvalues_df = count_pvalues_df.assign(
    abs_stat=count_pvalues_df['stat'].abs(),
)

In [177]:
# Keep features with p-value below 0.001, strongest |t| first.
count_pvalues_filtered_df = (
    count_pvalues_df
    .loc[count_pvalues_df['pvalue'] < 0.001]
    .sort_values('abs_stat', ascending=False)
)

In [178]:
# Display the significant 'count_' features, ranked by |t|.
count_pvalues_filtered_df

Unnamed: 0,c,pvalue,stat,abs_stat
45,count_employer_a_income_2_months_ago,0.000000e+00,-42.362517,42.362517
136,count_loan_payments_2_months_ago,0.000000e+00,-40.624764,40.624764
145,count_loan_payments_previous_month,0.000000e+00,-39.937455,39.937455
54,count_employer_a_income_previous_month,0.000000e+00,-39.623883,39.623883
41,count_distinct_micro_lenders,1.604758e-305,-38.334076,38.334076
...,...,...,...,...
168,count_mortgage_payments_8_months_ago,4.515084e-04,-3.508840,3.508840
169,count_mortgage_payments_9_months_ago,4.847694e-04,-3.489862,3.489862
36,count_disability_income_7_months_ago,8.532061e-04,3.335457,3.335457
207,count_pension_income_10_months_ago,9.387601e-04,-3.309000,3.309000


In [179]:
# Persist the ranked 'count_' features; the relative path resolves against the
# notebook's working directory, and the row index is not written.
count_pvalues_filtered_df.to_csv('refusal_count_pvalues_filtered.csv', index=False)

In [180]:
# The ten 'count_' features with the largest |t| (the frame is already sorted).
count_top10_df = count_pvalues_filtered_df.head(10)

In [181]:
# Display the ten top-ranked 'count_' features.
count_top10_df

Unnamed: 0,c,pvalue,stat,abs_stat
45,count_employer_a_income_2_months_ago,0.0,-42.362517,42.362517
136,count_loan_payments_2_months_ago,0.0,-40.624764,40.624764
145,count_loan_payments_previous_month,0.0,-39.937455,39.937455
54,count_employer_a_income_previous_month,0.0,-39.623883,39.623883
41,count_distinct_micro_lenders,1.604758e-305,-38.334076,38.334076
149,count_micro_loan_payments_2_months_ago,6.095917e-292,-37.511878,37.511878
158,count_micro_loan_payments_previous_month,7.26544e-282,-36.787331,36.787331
137,count_loan_payments_3_months_ago,5.2832739999999995e-266,-35.722353,35.722353
150,count_micro_loan_payments_3_months_ago,4.583513e-237,-33.622316,33.622316
46,count_employer_a_income_3_months_ago,2.246586e-235,-33.309525,33.309525


In [182]:
# Pairwise Spearman rank correlation between the top-10 'count_' features,
# computed over the combined accepted + refused frame.
count_top10_names = list(count_top10_df['c'])
count_top10_corr = df[count_top10_names].corr(method='spearman')

In [183]:
# Reshape the square correlation matrix to long form: one row per
# (var1, var2) pair with the correlation in 'value'.
count_top10_corr_long = (
    count_top10_corr
    .reset_index()
    .melt(id_vars=['index'])
    .rename(columns={'index': 'var1', 'variable': 'var2'})
)

In [184]:
# Show distinct feature pairs with Spearman correlation of at least 0.9.
# The long table holds both orderings of each pair, so every pair appears twice.
# NOTE(review): only positive correlations are matched; a strongly negative
# pair (value <= -0.9) would not show up — confirm that this is intended.
count_top10_corr_long.query('(var1 != var2) & (value >= 0.9)')

Unnamed: 0,var1,var2,value
12,count_loan_payments_previous_month,count_loan_payments_2_months_ago,0.900313
21,count_loan_payments_2_months_ago,count_loan_payments_previous_month,0.900313


# Sum variables

In [145]:
# Names of all feature columns whose label begins with 'sum_'.
sum_vars = list(filter(lambda name: name.startswith('sum_'), df.columns))

In [147]:
# Number of 'sum_' feature columns selected above.
len(sum_vars)

302

In [149]:
# Welch's t-test (equal_var=False) of every 'sum_' feature, refused vs.
# accepted rows. Results are collected column-wise in `stats`:
#   'c'      -> feature name
#   'pvalue' -> two-sided p-value
#   'stat'   -> t statistic (refused minus accepted, per argument order)
#
# NOTE(review): ttest_ind uses nan_policy='propagate' by default, so a feature
# with any missing value yields NaN here and is silently dropped by the
# `pvalue < 0.001` filter further down — confirm that this is intended.
stats = {'c': [], 'pvalue': [], 'stat': []}

# The row split does not depend on the feature, so the masks are built once
# instead of being recomputed for every column.
is_refused = df['refusal'] == 1
is_accepted = df['refusal'] == 0

for c in sum_vars:
    result = ttest_ind(df.loc[is_refused, c], df.loc[is_accepted, c], equal_var=False)
    stats['c'].append(c)
    stats['pvalue'].append(result.pvalue)
    stats['stat'].append(result.statistic)

In [158]:
# One row per 'sum_' feature with its p-value and t statistic.
# `stats` is also assigned by the count-variable loop, so this cell must run
# right after the sum-variable loop.
sum_pvalues_df = pd.DataFrame(stats)

In [159]:
# Magnitude of the t statistic, used to rank features regardless of sign.
sum_pvalues_df = sum_pvalues_df.assign(
    abs_stat=sum_pvalues_df['stat'].abs(),
)

In [160]:
# Keep features with p-value below 0.001, strongest |t| first.
sum_pvalues_filtered_df = (
    sum_pvalues_df
    .loc[sum_pvalues_df['pvalue'] < 0.001]
    .sort_values('abs_stat', ascending=False)
)

In [161]:
# The ten 'sum_' features with the largest |t| (the frame is already sorted).
sum_top10_df = sum_pvalues_filtered_df.head(10)

In [165]:
# Display the ten top-ranked 'sum_' features.
sum_top10_df

Unnamed: 0,c,pvalue,stat,abs_stat
77,sum_employer_income_previous_month,0.0,-47.01047,47.01047
68,sum_employer_income_2_months_ago,0.0,-45.207414,45.207414
42,sum_employer_a_income_2_months_ago,0.0,-43.913843,43.913843
51,sum_employer_a_income_previous_month,0.0,-43.284676,43.284676
69,sum_employer_income_3_months_ago,1.3335610000000002e-219,-32.294065,32.294065
149,sum_loan_payments_previous_month,1.84223e-219,-32.17472,32.17472
140,sum_loan_payments_2_months_ago,2.381645e-201,-30.790564,30.790564
136,sum_loan_payments,4.595155e-189,-29.824807,29.824807
43,sum_employer_a_income_3_months_ago,2.010103e-188,-29.808918,29.808918
153,sum_micro_loan_payments_2_months_ago,6.719752e-174,-28.532632,28.532632


In [162]:
# Pairwise Spearman rank correlation between the top-10 'sum_' features,
# computed over the combined accepted + refused frame.
sum_top10_names = list(sum_top10_df['c'])
sum_top10_corr = df[sum_top10_names].corr(method='spearman')

In [163]:
# Reshape the square correlation matrix to long form: one row per
# (var1, var2) pair with the correlation in 'value'.
sum_top10_corr_long = (
    sum_top10_corr
    .reset_index()
    .melt(id_vars=['index'])
    .rename(columns={'index': 'var1', 'variable': 'var2'})
)

In [164]:
# Show distinct feature pairs with Spearman correlation of at least 0.9.
# NOTE(review): only positive correlations are matched; a strongly negative
# pair (value <= -0.9) would not show up — confirm that this is intended.
sum_top10_corr_long.query('(var1 != var2) & (value >= 0.9)')

Unnamed: 0,var1,var2,value


# Combined

In [185]:
# Stack the top-10 'count_' and top-10 'sum_' features into one table.
# Row labels are carried over from each stats table, so the same label can
# occur for a count feature and a sum feature; identify rows by column 'c'.
combined_df = pd.concat([count_top10_df, sum_top10_df])

In [195]:
# Rank the combined features by |t|, strongest first.
combined_df = combined_df.sort_values('abs_stat', ascending=False)

In [198]:
# Display the combined count/sum feature ranking.
combined_df

Unnamed: 0,c,pvalue,stat,abs_stat
77,sum_employer_income_previous_month,0.0,-47.01047,47.01047
68,sum_employer_income_2_months_ago,0.0,-45.207414,45.207414
42,sum_employer_a_income_2_months_ago,0.0,-43.913843,43.913843
51,sum_employer_a_income_previous_month,0.0,-43.284676,43.284676
45,count_employer_a_income_2_months_ago,0.0,-42.362517,42.362517
136,count_loan_payments_2_months_ago,0.0,-40.624764,40.624764
145,count_loan_payments_previous_month,0.0,-39.937455,39.937455
54,count_employer_a_income_previous_month,0.0,-39.623883,39.623883
41,count_distinct_micro_lenders,1.604758e-305,-38.334076,38.334076
149,count_micro_loan_payments_2_months_ago,6.095917e-292,-37.511878,37.511878


In [188]:
# Pairwise Spearman rank correlation between all features in the combined
# ranking, computed over the combined accepted + refused frame.
combined_names = list(combined_df['c'])
top20_corr = df[combined_names].corr(method='spearman')

In [189]:
# Reshape the square correlation matrix to long form: one row per
# (var1, var2) pair with the correlation in 'value'.
top20_corr_long = (
    top20_corr
    .reset_index()
    .melt(id_vars=['index'])
    .rename(columns={'index': 'var1', 'variable': 'var2'})
)

In [194]:
# Persist the long-form correlation table; the relative path resolves against
# the notebook's working directory, and the row index is not written.
top20_corr_long.to_csv('refusal_top20_corr.csv', index=False)

In [197]:
# Distinct feature pairs whose Spearman correlation is at least 0.9.
# Both orderings of a pair are present in the long table, so each pair is
# listed twice (once per direction).
is_distinct_pair = top20_corr_long['var1'] != top20_corr_long['var2']
is_strongly_correlated = top20_corr_long['value'] >= 0.9
highly_correlated_df = top20_corr_long[is_distinct_pair & is_strongly_correlated]
highly_correlated_df

Unnamed: 0,var1,var2,value
22,count_loan_payments_previous_month,count_loan_payments_2_months_ago,0.900313
36,sum_loan_payments_2_months_ago,count_loan_payments_2_months_ago,0.926035
41,count_loan_payments_2_months_ago,count_loan_payments_previous_month,0.900313
55,sum_loan_payments_previous_month,count_loan_payments_previous_month,0.91818
119,sum_micro_loan_payments_2_months_ago,count_micro_loan_payments_2_months_ago,0.954152
198,sum_employer_a_income_3_months_ago,count_employer_a_income_3_months_ago,0.933543
302,count_loan_payments_previous_month,sum_loan_payments_previous_month,0.91818
321,count_loan_payments_2_months_ago,sum_loan_payments_2_months_ago,0.926035
369,count_employer_a_income_3_months_ago,sum_employer_a_income_3_months_ago,0.933543
385,count_micro_loan_payments_2_months_ago,sum_micro_loan_payments_2_months_ago,0.954152


In [201]:
# Attach each ranked feature's highly correlated partners (var2, value).
# Left merge: features without a partner are kept with missing var1/var2/value,
# and a feature with several partners is repeated once per partner.
final_df = combined_df.merge(highly_correlated_df, left_on='c', right_on='var1', how='left')

In [202]:
# Persist the final feature table; the relative path resolves against the
# notebook's working directory, and the row index is not written.
final_df.to_csv('refusal_top_features.csv', index=False)