In [94]:
import pandas as pd
from pandas.api.types import is_numeric_dtype
from westgate.flaml_model import *
from sklearn.feature_selection import mutual_info_classif
import random
from collections import defaultdict
import math
from tqdm import tqdm

from flaml import AutoML

In [3]:
refused_raw_df = pd.read_csv('../data/refusal2022.csv', encoding='latin')

In [4]:
refused_raw_df['recurring_deposits_90_days'] = (
    refused_raw_df['recurring_deposits_current_month'] + 
    refused_raw_df['recurring_deposits_previous_month'] +
    refused_raw_df['recurring_deposits_2_months_ago']
)

refused_raw_df['sum_micro_loans_60_days'] = (
    refused_raw_df['sum_micro_loan_payments_current_month'] + 
    refused_raw_df['sum_micro_loan_payments_previous_month'] 
)

refused_raw_df['recurring_deposits_90_days'] = (
    refused_raw_df['recurring_deposits_current_month'] + 
    refused_raw_df['recurring_deposits_previous_month'] +
    refused_raw_df['recurring_deposits_2_months_ago']
)

refused_raw_df['sum_micro_loans_60_days'] = (
    refused_raw_df['sum_micro_loan_payments_current_month'] + 
    refused_raw_df['sum_micro_loan_payments_previous_month'] 
)

In [5]:
print('# loans before auto-refusal: ' + str(len(refused_raw_df)))

refused_df = refused_raw_df[refused_raw_df['account_age_days'] >= 85]

refused_df = refused_df[refused_df['count_nsf_90_days'] <= 8]

refused_df = refused_df[refused_df['count_nsf_30_days'] <= 6]

refused_df = refused_df[refused_df['count_stop_payment_90_days'] <= 4]

#refused_df = refused_df[refused_df['recurring_deposits_90_days']/3.0 >= 1800] <-- filers way too much

#refused_df = refused_df[refused_df['sum_micro_loans_60_days'] <= 1000]

print('# loans after auto-refusal: ' + str(len(refused_df)))

# loans before auto-refusal: 37398
# loans after auto-refusal: 28579


In [6]:
accepted_raw_df = pd.read_csv('../data/AttributesLoans2022.csv', encoding='latin')

  accepted_raw_df = pd.read_csv('../data/AttributesLoans2022.csv', encoding='latin')


In [7]:
len(accepted_raw_df)

10098

In [8]:
def filter_accepted_df(df):
    if 'error' in df.columns:
        df = df[df['error'].isna()]
    if 'Id' in df.columns:
        df = df[~df['Id'].isna()]
    if 'request_date' in df.columns:
        df = df[~df['request_date'].isna()]
    if 'account_age_days' in df.columns:
        df = df[df['account_age_days'] > 0]
    return df

In [9]:
accepted_df = filter_accepted_df(accepted_raw_df)

In [10]:
len(accepted_df)

9633

In [11]:
accepted_df.drop(columns=['requestId', 'fees_paid', 'first_repayment_date', 
                          'last_repayment_date', 'repayment_count', 'error'],
                inplace=True)

In [12]:
na_cols = accepted_df.isna().sum()
na_cols[na_cols > 0]

average_monthly_auto_loan_payments_complex                   8
average_monthly_child_support_income_government_complex     39
average_monthly_government_income_complex                   12
average_monthly_loan_deposits_complex                       15
average_monthly_loan_payments_complex                       31
average_monthly_micro_loan_payments_complex                 27
average_monthly_non_employer_income_complex                 13
average_monthly_other_loan_payments_complex                 16
average_monthly_pension_income_complex                      24
average_monthly_social_assistance_income_complex           103
average_monthly_student_loan_payments_complex                1
average_monthly_telecom_payments_complex                    24
average_monthly_total_income_complex                        32
average_monthly_utility_payments_complex                     9
average_monthly_wsib_income_complex                          4
estimated_annual_net_employer_income                   

In [13]:
accepted_df.fillna(-1, inplace=True)

In [14]:
accepted_df['refusal'] = 0
refused_df['refusal'] = 1

In [15]:
na_cols = refused_df.isna().sum()
na_cols[na_cols > 0]

average_monthly_child_support_income_government_complex     106
average_monthly_pension_income_complex                       57
average_monthly_social_assistance_income_complex            476
average_monthly_wsib_income_complex                          18
balance_max                                                   2
balance_min                                                   6
estimated_annual_net_employer_income                       6162
dtype: int64

In [16]:
refused_df['average_monthly_child_support_income_government_complex'].fillna(-1, inplace=True)
refused_df['average_monthly_pension_income_complex'].fillna(-1, inplace=True)
refused_df['average_monthly_social_assistance_income_complex'].fillna(-1, inplace=True)
refused_df['average_monthly_wsib_income_complex'].fillna(-1, inplace=True)
refused_df['estimated_annual_net_employer_income'].fillna(-1, inplace=True)

In [17]:
na_cols = refused_df.isna().sum()
na_cols[na_cols > 0]

balance_max    2
balance_min    6
dtype: int64

In [18]:
refused_df.dropna(inplace=True)

In [19]:
refused_df.isna().sum().sum()

0

In [20]:
accepted_df.isna().sum().sum()

0

In [21]:
common_cols = [c for c in refused_df.columns if c in accepted_df.columns]
refusal_only_cols = [c for c in refused_df.columns if c not in accepted_df.columns]
default_only_cols = [c for c in accepted_df.columns if c not in refused_df.columns]

In [22]:
refusal_only_cols

['approbexRequest',
 'province',
 'refusalReason',
 'recurring_deposits_90_days',
 'sum_micro_loans_60_days']

In [23]:
default_only_cols

['Id', 'principal', 'total_paid', 'loan_application_id', 'borrower_id']

In [24]:
df = pd.concat([accepted_df[common_cols], refused_df[common_cols]])

In [25]:
len(df)

38205

In [26]:
df.isna().sum().sum()

0

In [27]:
exclude_cols = ['approbexRequest', 'loginId', 'Id', 'requestDate', 'refusalReason', 
                'MostRecentTransactionDate', 'error', 'borrower_id', 'loan_application_id',
                'repayment_count', 'request_id', 'status', 'total_paid', 'fees_paid', 
                'first_repayment_date', 'last_repayment_date']

df = df[[c for c in df.columns if c not in exclude_cols]]

In [28]:
pd.factorize(df['gender'])

(array([0, 0, 1, ..., 0, 0, 1], dtype=int64),
 Index(['male', 'female'], dtype='object'))

In [29]:
def feature_engineer(df: pd.DataFrame) -> pd.DataFrame:

    def time_diff(df):
        try:
            request_date = df['request_date']
            dob = df['dob']
            request_date = parse(request_date)
            dob = parse(dob)
            return relativedelta(request_date, dob).years
        except Exception as e:
            print('Problematic value for dob ' + str(dob))
    
    assert 'dob' in df.columns
    assert is_string_dtype(df['dob'])
    assert 'request_date' in df.columns
    assert is_string_dtype(df['request_date'])

    df['dob'] = df['dob'].str[:10]
    df['request_date'] = df['request_date'].str[:10]

    df['age'] = df[['request_date', 'dob']].apply(time_diff, axis=1)

    return df

In [30]:
df = feature_engineer(df)

Problematic value for dob ##########
Problematic value for dob ##########
Problematic value for dob ##########
Problematic value for dob ##########


In [31]:
df['age'].isna().sum()

4

In [32]:
df.dropna(inplace=True)

In [33]:
df.drop(columns=['request_date', 'dob'], inplace=True)

In [34]:
for c in df:
    if not is_numeric_dtype(df[c]):
        if df[c].nunique() < 20:
            try:
                values, uniques = pd.factorize(df[c])
                df[c] = values
            except Exception as e:
                print('Exception while factorizing colunm ' + str(c))
                print(e)
        else:
            print('dropping column ' + str(c))
            df.drop(c, axis=1, inplace=True)

dropping column average_closing_balance_day_after_government_income
dropping column average_closing_balance_day_of_government_income
dropping column employer_a_name
dropping column employer_b_name
dropping column employer_name
dropping column employer_other_name
dropping column last_2_paydates_child_support_income_government
dropping column last_2_paydates_disability_income
dropping column last_2_paydates_employer_a
dropping column last_2_paydates_employer_b
dropping column last_2_paydates_employer_other
dropping column last_2_paydates_employment_insurance_income
dropping column last_2_paydates_government_income
dropping column last_2_paydates_pension_income
dropping column last_2_paydates_social_assistance_income
dropping column last_2_paydates_wsib_income
dropping column micro_lender_name
dropping column last_2_social_assistance_income_dates
dropping column last_2_employment_insurance_income_dates
dropping column last_2_wsib_income_dates
dropping column last_2_pension_income_dates
dr

In [36]:
len(df.columns)

718

In [37]:
df.columns

Index(['gender', 'account_age_days', 'active_days_trend',
       'auto_loan_payment_frequency',
       'average_closing_balance_day_after_employer_income',
       'average_closing_balance_day_after_income',
       'average_closing_balance_day_of_employer_income',
       'average_closing_balance_day_of_income', 'average_disability_deposit',
       'average_employer_a_income_deposit',
       ...
       'student_loan_payments_frequency', 'telecom_payments_frequency',
       'telecom_payments_average', 'other_loan_payments_frequency',
       'other_loan_payments_average', 'utility_payments_frequency',
       'utility_payments_average', 'average_monthly_insurance_income_complex',
       'refusal', 'age'],
      dtype='object', length=718)

In [45]:
train_df = df[[c for c in df.columns if c != 'refusal']]

In [46]:
target = df['refusal']

In [54]:
random_cols = random.sample(sorted(df.columns), 5)

In [55]:
random_cols

['count_micro_loan_payments_2_months_ago',
 'sum_social_assistance_income_4_months_ago',
 'telecom_payments_frequency',
 'average_monthly_employer_other_income_complex',
 'count_student_loan_payments_5_months_ago']

In [95]:
results = []
n_cols = int(math.sqrt(len(df.columns)))
np.random.seed(123)

for iter in tqdm(range(100)):

    random_cols = random.sample(sorted(df.columns), n_cols)

    automl_settings = {
        "time_budget": 10,  # in seconds
        "metric": "roc_auc",
        "task": 'classification',
        "estimator_list": ['rf'],
        "eval_method": "cv",
        "n_splits": 5,
        "retrain_full": False,
        "verbose": 2
    }

    automl = AutoML()

    automl.fit(train_df, target, **automl_settings)

    loss = automl.best_loss

    r = {random_cols[i]:loss for i in range(len(random_cols))}

    results.append(r)


100%|██████████| 100/100 [15:56:34<00:00, 573.95s/it]     


In [96]:
agg_results = defaultdict(list)

for r in results:
    for k,v in r.items():
        agg_results[k].append(v)

In [97]:
final_results = {k:[len(v), sum(v)/len(v)] for k,v in agg_results.items()}

In [99]:
pd.DataFrame.from_records(list(final_results.values()), 
                               index=final_results.keys(),
                               columns = ['count', 'average_loss']).sort_values('count', ascending=False)

Unnamed: 0,count,average_loss
gender,10,inf
count_mortgage_payments_9_months_ago,10,inf
sum_government_income_11_months_ago,9,inf
recurring_deposits_3_months_ago,9,0.279370
average_monthly_micro_loan_payments_complex,9,inf
...,...,...
sum_employment_insurance_income_9_months_ago,1,0.254814
count_student_loan_payments_4_months_ago,1,0.254814
sum_micro_loan_payments_previous_month,1,0.254814
sum_telecom_payments_2_months_ago,1,0.253517


In [67]:
pd.DataFrame(results[0], index=[0])

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,loss
0,recurring_deposits_7_months_ago,sum_auto_loan_payments_3_months_ago,utility_payments_average,count_auto_loan_payments_4_months_ago,count_employer_b_income_10_months_ago,0.29439


In [69]:
results_df = pd.concat([pd.DataFrame(r, index=[0]) for r in results])

In [72]:
results_df.groupby('feature_0')['loss'].agg(['count', 'sum'])

Unnamed: 0_level_0,count,sum
feature_0,Unnamed: 1_level_1,Unnamed: 2_level_1
recurring_deposits_7_months_ago,1,0.29439
sum_disability_income_2_months_ago,1,0.25873
telecom_payments_frequency,1,0.299457
