In [55]:
import pandas as pd
import numpy as np
from westgate.flaml_model import *
from scipy.stats import ttest_ind
from pandas.api.types import is_numeric_dtype

In [9]:
default_model = load_model(experiment_id='default_1.0', basefolder='../default_model/')

In [2]:
accepted_df = pd.read_csv('../data/AttributesLoans2022.csv', encoding='latin')

  accepted_df = pd.read_csv('../data/AttributesLoans2022.csv', encoding='latin')


In [7]:
accepted_df['profit'] = accepted_df['total_paid'] - accepted_df['principal']

In [8]:
accepted_df[default_model.target] = np.where(accepted_df['profit'] < 50, 1, 0)

In [10]:
filtered_df = default_model.filter_df(accepted_df)

Rows with 'account_age_days' column not positive will be discarded.


In [11]:
len(filtered_df)

9507

In [12]:
df = filtered_df

# Count variables

In [34]:
'count_active_days' in default_model.features_in

False

In [16]:
count_vars = [c for c in df.columns if c.startswith('count_') and c not in default_model.features_in]

In [17]:
len(count_vars)

299

In [19]:
stats = {'c': [], 'pvalue': [], 'stat': []}

for c in count_vars:
    refused = df.loc[df['status']==1, c]
    accepted = df.loc[df['status']==0, c]
    result = ttest_ind(refused, accepted, equal_var=False)
    stats['c'].append(c)
    stats['pvalue'].append(result.pvalue)
    stats['stat'].append(result.statistic)

In [20]:
count_pvalues_df = pd.DataFrame(stats)

In [21]:
count_pvalues_df['abs_stat'] = count_pvalues_df['stat'].abs()

In [22]:
count_pvalues_filtered_df = count_pvalues_df[count_pvalues_df['pvalue'] < 0.001].sort_values('abs_stat', ascending=False)

In [36]:
count_pvalues_filtered_df[1:20]

Unnamed: 0,c,pvalue,stat,abs_stat
139,count_loan_payments_7_months_ago,3.7250869999999995e-232,-33.462038,33.462038
131,count_loan_payments_10_months_ago,9.906213e-221,-32.591626,32.591626
137,count_loan_payments_5_months_ago,4.2445750000000004e-219,-32.444533,32.444533
150,count_micro_loan_payments_7_months_ago,5.0376e-214,-32.073293,32.073293
140,count_loan_payments_8_months_ago,1.5727510000000001e-211,-31.844093,31.844093
138,count_loan_payments_6_months_ago,1.976954e-211,-31.836931,31.836931
40,count_employer_a_income_10_months_ago,8.135873e-210,-31.724561,31.724561
135,count_loan_payments_3_months_ago,1.306806e-207,-31.586113,31.586113
132,count_loan_payments_11_months_ago,2.8945940000000003e-204,-31.267276,31.267276
151,count_micro_loan_payments_8_months_ago,1.131452e-202,-31.142404,31.142404


In [37]:
count_pvalues_filtered_df.to_csv('default_count_pvalues_filtered.csv', index=False)

In [38]:
count_top10_df = count_pvalues_filtered_df.iloc[0:10]

In [39]:
count_top10_df

Unnamed: 0,c,pvalue,stat,abs_stat
0,count_active_days,9.895982e-255,-35.456276,35.456276
139,count_loan_payments_7_months_ago,3.7250869999999995e-232,-33.462038,33.462038
131,count_loan_payments_10_months_ago,9.906213e-221,-32.591626,32.591626
137,count_loan_payments_5_months_ago,4.2445750000000004e-219,-32.444533,32.444533
150,count_micro_loan_payments_7_months_ago,5.0376e-214,-32.073293,32.073293
140,count_loan_payments_8_months_ago,1.5727510000000001e-211,-31.844093,31.844093
138,count_loan_payments_6_months_ago,1.976954e-211,-31.836931,31.836931
40,count_employer_a_income_10_months_ago,8.135873e-210,-31.724561,31.724561
135,count_loan_payments_3_months_ago,1.306806e-207,-31.586113,31.586113
132,count_loan_payments_11_months_ago,2.8945940000000003e-204,-31.267276,31.267276


In [40]:
count_top10_corr = df[[c for c in count_top10_df['c']]].corr(method='spearman')

In [41]:
count_top10_corr_long = count_top10_corr\
                        .reset_index()\
                        .melt(id_vars = ['index']).rename(columns={'index':'var1', 'variable':'var2'})

In [42]:
count_top10_corr_long.query('(var1 != var2) & (value >= 0.9)')

Unnamed: 0,var1,var2,value
12,count_loan_payments_10_months_ago,count_loan_payments_7_months_ago,0.904298
13,count_loan_payments_5_months_ago,count_loan_payments_7_months_ago,0.92925
14,count_micro_loan_payments_7_months_ago,count_loan_payments_7_months_ago,0.945357
15,count_loan_payments_8_months_ago,count_loan_payments_7_months_ago,0.963249
16,count_loan_payments_6_months_ago,count_loan_payments_7_months_ago,0.961853
21,count_loan_payments_7_months_ago,count_loan_payments_10_months_ago,0.904298
25,count_loan_payments_8_months_ago,count_loan_payments_10_months_ago,0.931165
29,count_loan_payments_11_months_ago,count_loan_payments_10_months_ago,0.961818
31,count_loan_payments_7_months_ago,count_loan_payments_5_months_ago,0.92925
35,count_loan_payments_8_months_ago,count_loan_payments_5_months_ago,0.906581


# Sum variables

In [43]:
sum_vars = [c for c in df.columns if c.startswith('sum_') and c not in default_model.features_in]

In [44]:
len(sum_vars)

294

In [45]:
stats = {'c': [], 'pvalue': [], 'stat': []}

for c in sum_vars:
    refused = df.loc[df['status']==1, c]
    accepted = df.loc[df['status']==0, c]
    result = ttest_ind(refused, accepted, equal_var=False)
    stats['c'].append(c)
    stats['pvalue'].append(result.pvalue)
    stats['stat'].append(result.statistic)

In [46]:
sum_pvalues_df = pd.DataFrame(stats)

In [47]:
sum_pvalues_df['abs_stat'] = sum_pvalues_df['stat'].abs()

In [48]:
sum_pvalues_filtered_df = sum_pvalues_df[sum_pvalues_df['pvalue'] < 0.001].sort_values('abs_stat', ascending=False)

In [49]:
sum_top10_df = sum_pvalues_filtered_df.iloc[0:10]

In [50]:
sum_top10_df

Unnamed: 0,c,pvalue,stat,abs_stat
133,sum_loan_payments,2.4674290000000002e-265,-35.952638,35.952638
66,sum_employer_income_11_months_ago,9.891878e-239,-33.962609,33.962609
65,sum_employer_income_10_months_ago,1.057213e-214,-32.106313,32.106313
141,sum_loan_payments_6_months_ago,1.118555e-207,-31.538747,31.538747
143,sum_loan_payments_8_months_ago,3.857794e-198,-30.763075,30.763075
142,sum_loan_payments_7_months_ago,5.109506e-198,-30.757252,30.757252
140,sum_loan_payments_5_months_ago,8.274486000000001e-194,-30.41864,30.41864
73,sum_employer_income_8_months_ago,3.9613060000000004e-191,-30.178132,30.178132
40,sum_employer_a_income_11_months_ago,5.706387e-189,-29.99878,29.99878
139,sum_loan_payments_4_months_ago,1.505777e-188,-29.981858,29.981858


In [162]:
sum_top10_corr = df[[c for c in sum_top10_df['c']]].corr(method='spearman')

In [163]:
sum_top10_corr_long = sum_top10_corr\
                        .reset_index()\
                        .melt(id_vars = ['index']).rename(columns={'index':'var1', 'variable':'var2'})

In [164]:
sum_top10_corr_long.query('(var1 != var2) & (value >= 0.9)')

Unnamed: 0,var1,var2,value


# Other variables

In [54]:
df['status'].dtype

dtype('int32')

In [57]:
other_vars = [c for c in df.columns 
                    if not c.startswith('sum_') 
                    and not c.startswith('count_')
                    and c not in default_model.features_in
                    and is_numeric_dtype(df[c])
                    ]

In [58]:
len(other_vars)

69

In [59]:
stats = {'c': [], 'pvalue': [], 'stat': []}

for c in other_vars:
    refused = df.loc[df['status']==1, c]
    accepted = df.loc[df['status']==0, c]
    result = ttest_ind(refused, accepted, equal_var=False)
    stats['c'].append(c)
    stats['pvalue'].append(result.pvalue)
    stats['stat'].append(result.statistic)

  res = hypotest_fun_out(*samples, **kwds)


In [60]:
other_pvalues_df = pd.DataFrame(stats)

In [61]:
other_pvalues_df['abs_stat'] = other_pvalues_df['stat'].abs()

In [62]:
other_pvalues_filtered_df = other_pvalues_df[other_pvalues_df['pvalue'] < 0.001].sort_values('abs_stat', ascending=False)

In [64]:
other_pvalues_filtered_df[0:20]

Unnamed: 0,c,pvalue,stat,abs_stat
63,status,0.0,inf,inf
68,profit,0.0,-182.745738,182.745738
65,total_paid,0.0,-175.326505,175.326505
46,recurring_deposits_11_months_ago,5.180586e-305,-38.842548,38.842548
45,recurring_deposits_10_months_ago,1.736439e-281,-37.190801,37.190801
55,recurring_deposits_9_months_ago,3.271486e-250,-34.893668,34.893668
0,account_age_days,1.286926e-242,-34.5497,34.5497
47,recurring_deposits_12_months_ago,1.9214269999999998e-216,-32.259491,32.259491
54,recurring_deposits_8_months_ago,4.881763e-180,-29.333982,29.333982
53,recurring_deposits_7_months_ago,2.618009e-165,-27.99626,27.99626


# Combined

In [185]:
combined_df = pd.concat([count_top10_df, sum_top10_df])

In [195]:
combined_df.sort_values('abs_stat', ascending=False, inplace=True)

In [198]:
combined_df

Unnamed: 0,c,pvalue,stat,abs_stat
77,sum_employer_income_previous_month,0.0,-47.01047,47.01047
68,sum_employer_income_2_months_ago,0.0,-45.207414,45.207414
42,sum_employer_a_income_2_months_ago,0.0,-43.913843,43.913843
51,sum_employer_a_income_previous_month,0.0,-43.284676,43.284676
45,count_employer_a_income_2_months_ago,0.0,-42.362517,42.362517
136,count_loan_payments_2_months_ago,0.0,-40.624764,40.624764
145,count_loan_payments_previous_month,0.0,-39.937455,39.937455
54,count_employer_a_income_previous_month,0.0,-39.623883,39.623883
41,count_distinct_micro_lenders,1.604758e-305,-38.334076,38.334076
149,count_micro_loan_payments_2_months_ago,6.095917e-292,-37.511878,37.511878


In [188]:
top20_corr = df[[c for c in combined_df['c']]].corr(method='spearman')

In [189]:
top20_corr_long = top20_corr\
                        .reset_index()\
                        .melt(id_vars = ['index']).rename(columns={'index':'var1', 'variable':'var2'})

In [194]:
top20_corr_long.to_csv('refusal_top20_corr.csv', index=False)

In [197]:
highly_correlated_df = top20_corr_long.query('(var1 != var2) & (value >= 0.9)')
highly_correlated_df

Unnamed: 0,var1,var2,value
22,count_loan_payments_previous_month,count_loan_payments_2_months_ago,0.900313
36,sum_loan_payments_2_months_ago,count_loan_payments_2_months_ago,0.926035
41,count_loan_payments_2_months_ago,count_loan_payments_previous_month,0.900313
55,sum_loan_payments_previous_month,count_loan_payments_previous_month,0.91818
119,sum_micro_loan_payments_2_months_ago,count_micro_loan_payments_2_months_ago,0.954152
198,sum_employer_a_income_3_months_ago,count_employer_a_income_3_months_ago,0.933543
302,count_loan_payments_previous_month,sum_loan_payments_previous_month,0.91818
321,count_loan_payments_2_months_ago,sum_loan_payments_2_months_ago,0.926035
369,count_employer_a_income_3_months_ago,sum_employer_a_income_3_months_ago,0.933543
385,count_micro_loan_payments_2_months_ago,sum_micro_loan_payments_2_months_ago,0.954152


In [201]:
final_df = combined_df.merge(highly_correlated_df, left_on='c', right_on='var1', how='left')

In [202]:
final_df.to_csv('refusal_top_features.csv', index=False)