In [2]:
!pip install scorecardpy



In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import scorecardpy as sc

In [4]:
loan_data = pd.read_excel('data_v3.xlsx')

In [5]:
loan_data.shape

(50000, 9)

In [6]:
loan_data.head()

Unnamed: 0,sample_id,sample_month,adr_stability_grade,last_6m_avg_asset_total_grade,ovd_order_cnt_6m_grade,positive_biz_cnt_1y_grade,risk_score,repayment_ability_rank,is_dlq_30d
0,1,201802,10,5,1,10,1,1,0.0
1,2,201804,10,6,1,7,2,1,0.0
2,3,201803,10,5,1,7,3,1,0.0
3,4,201802,10,3,1,9,3,2,0.0
4,5,201804,10,5,1,10,1,2,0.0


In [7]:
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   sample_id                      50000 non-null  int64  
 1   sample_month                   50000 non-null  int64  
 2   adr_stability_grade            50000 non-null  int64  
 3   last_6m_avg_asset_total_grade  50000 non-null  int64  
 4   ovd_order_cnt_6m_grade         50000 non-null  int64  
 5   positive_biz_cnt_1y_grade      50000 non-null  int64  
 6   risk_score                     50000 non-null  int64  
 7   repayment_ability_rank         50000 non-null  int64  
 8   is_dlq_30d                     45247 non-null  float64
dtypes: float64(1), int64(8)
memory usage: 3.4 MB


We can see there are nulls in our target variable, so the dataset we have includes approval sample and rejection sample, what we need is just the approval sample

In [9]:
data_prep = loan_data[loan_data['is_dlq_30d'].isnull() == False]

In [10]:
data_prep.shape

(45247, 9)

In [11]:
print('Bad customer concentration is {} %'.format(round(data_prep['is_dlq_30d'].mean()*100), 2))

Bad customer concentration is 8 %


In [12]:
data_prep['is_dlq_30d'] = data_prep['is_dlq_30d'].astype(int)

var = data_prep.columns.difference(['sample_id', 'sample_month']).tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_prep['is_dlq_30d'] = data_prep['is_dlq_30d'].astype(int)


In [13]:
bins_info = sc.woebin(data_prep[var], y = 'is_dlq_30d', method = 'tree', breaks_list = {}, stop_limit = 0)

[INFO] creating woe binning ...


  dat.loc[:,y] = dat[y].apply(lambda x: x if pd.isnull(x) else int(x)) #dat[y].astype(int)
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\
  .agg({'good':sum, 'bad':sum}).reset_index().assign(bin=lambda x: x['bstbin'])\
  .agg({'good':sum, 'bad':sum}).reset_index().assign(bin=lambda x: x['bstbin'])\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], gr

In [14]:
bins_info['last_6m_avg_asset_total_grade']

Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,last_6m_avg_asset_total_grade,"[-inf,2.0)",13496,0.298274,11566,1930,0.143005,0.659256,0.170863,0.878497,2.0,False
1,last_6m_avg_asset_total_grade,"[2.0,3.0)",3695,0.081663,3284,411,0.111231,0.371581,0.013184,0.878497,3.0,False
2,last_6m_avg_asset_total_grade,"[3.0,4.0)",9602,0.212213,8786,816,0.084982,0.073305,0.001176,0.878497,4.0,False
3,last_6m_avg_asset_total_grade,"[4.0,5.0)",3996,0.088315,3792,204,0.051051,-0.472723,0.016212,0.878497,5.0,False
4,last_6m_avg_asset_total_grade,"[5.0,6.0)",3421,0.075607,3292,129,0.037708,-0.789633,0.034075,0.878497,6.0,False
5,last_6m_avg_asset_total_grade,"[6.0,8.0)",6353,0.140407,6256,97,0.015268,-1.71678,0.211533,0.878497,8.0,False
6,last_6m_avg_asset_total_grade,"[8.0,inf)",4684,0.103521,4676,8,0.001708,-3.920951,0.431454,0.878497,inf,False


In [15]:
# Max Bad Rate Dictionary
max_badrate = {}
for i, k in bins_info.items():
    max_badrate.update({i : round(k['badprob'].max(),2)})

max_badrate

{'risk_score': 0.54,
 'adr_stability_grade': 0.15,
 'positive_biz_cnt_1y_grade': 0.16,
 'ovd_order_cnt_6m_grade': 0.31,
 'last_6m_avg_asset_total_grade': 0.14,
 'repayment_ability_rank': 0.11}

In [16]:
# Max Lift Dictionary
max_lift = {}
bad_concen = data_prep['is_dlq_30d'].mean()

for i, k in bins_info.items():
    max_lift.update({i : round(k['badprob'].max()/bad_concen,2)})

max_lift

{'risk_score': 6.76,
 'adr_stability_grade': 1.89,
 'positive_biz_cnt_1y_grade': 2.01,
 'ovd_order_cnt_6m_grade': 3.88,
 'last_6m_avg_asset_total_grade': 1.8,
 'repayment_ability_rank': 1.37}

In [17]:
# Bin chart with lift for relevent for variales to be adjusted (max lift > 2)

var_adj = ['risk_score', 'positive_biz_cnt_1y_grade', 'ovd_order_cnt_6m_grade']

for i, k in bins_info.items():
    if i not in var_adj:
        continue
    else:
        bins_info[i]['lift'] =  round(bins_info[i]['badprob']/bad_concen, 2)
    

In [18]:
display(bins_info['ovd_order_cnt_6m_grade'].style.bar(subset=['badprob'], color='lightgreen', width=80, height=80, align='left', vmin=0).highlight_max(subset=['lift'], color='red',axis=0))

Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values,lift
0,ovd_order_cnt_6m_grade,"[-inf,2.0)",37512,0.829049,35837,1675,0.044652,-0.613362,0.241951,0.79371,2.0,False,0.56
1,ovd_order_cnt_6m_grade,"[2.0,3.0)",4663,0.103057,3689,974,0.208878,1.118106,0.203903,0.79371,3.0,False,2.63
2,ovd_order_cnt_6m_grade,"[3.0,inf)",3072,0.067894,2126,946,0.307943,1.64005,0.347857,0.79371,inf,False,3.88


In [19]:
display(bins_info['positive_biz_cnt_1y_grade'].style.bar(subset=['badprob'], color='lightgreen', width=80, height=80, align='left', vmin=0).highlight_max(subset=['lift'], color='red',axis=0))

Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values,lift
0,positive_biz_cnt_1y_grade,"[-inf,3.0)",5594,0.123633,4701,893,0.159635,0.788861,0.10692,0.286737,3.0,False,2.01
1,positive_biz_cnt_1y_grade,"[3.0,4.0)",2353,0.052003,2045,308,0.130897,0.556752,0.020364,0.286737,4.0,False,1.65
2,positive_biz_cnt_1y_grade,"[4.0,5.0)",2837,0.0627,2530,307,0.108213,0.340679,0.008399,0.286737,5.0,False,1.36
3,positive_biz_cnt_1y_grade,"[5.0,7.0)",6383,0.14107,5770,613,0.096036,0.207743,0.006645,0.286737,7.0,False,1.21
4,positive_biz_cnt_1y_grade,"[7.0,8.0)",3399,0.075121,3104,295,0.08679,0.096334,0.000726,0.286737,8.0,False,1.09
5,positive_biz_cnt_1y_grade,"[8.0,9.0)",3711,0.082016,3455,256,0.068984,-0.152595,0.001791,0.286737,9.0,False,0.87
6,positive_biz_cnt_1y_grade,"[9.0,10.0)",4632,0.102371,4411,221,0.047712,-0.543889,0.024163,0.286737,10.0,False,0.6
7,positive_biz_cnt_1y_grade,"[10.0,inf)",16338,0.361085,15636,702,0.042967,-0.653592,0.117728,0.286737,inf,False,0.54


In [21]:
display(bins_info['risk_score'].style.bar(subset=['badprob'], color='lightgreen', width=80, height=80, align='left', vmin=0).highlight_max(subset=['lift'], color='red',axis=0))

Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values,lift
0,risk_score,"[-inf,2.0)",10706,0.236612,10673,33,0.003082,-3.32916,0.822511,2.319257,2.0,False,0.04
1,risk_score,"[2.0,3.0)",7242,0.160055,7154,88,0.012151,-1.948285,0.28694,2.319257,3.0,False,0.15
2,risk_score,"[3.0,4.0)",7887,0.17431,7635,252,0.031951,-0.961264,0.108822,2.319257,4.0,False,0.4
3,risk_score,"[4.0,5.0)",7382,0.163149,6941,441,0.05974,-0.306351,0.013471,2.319257,5.0,False,0.75
4,risk_score,"[5.0,6.0)",7097,0.15685,6241,856,0.120614,0.46318,0.040886,2.319257,6.0,False,1.52
5,risk_score,"[6.0,7.0)",2467,0.054523,1866,601,0.243616,1.316848,0.161152,2.319257,7.0,False,3.07
6,risk_score,"[7.0,inf)",2466,0.054501,1142,1324,0.536902,2.597682,0.885476,2.319257,inf,False,6.76


In [39]:
rule_set = []
rule_set.append(('ovd_order_cnt_6m_grade',2,'gt'))

rule_set.append(('risk_score', 10, 'gt'))
rule_set.append(('last_6m_avg_asset_total_grade',1,'lt'))
rule_set.append(('adr_stability_grade',2,'lt'))
rule_set.append(('positive_biz_cnt_1y_grade',1,'lt'))
rule_set.append(('repayment_ability_rank',4,'gt'))

In [135]:
def ruleset_calc(df:pd.DataFrame, lst:list):
    """
    hit_rate in rule set 
    """
    data = df.copy()
    # Create the rule hit variable with default value 0
    for n in lst:
        data[str(n[0])+'_hit'] = 0
    
    # Loop all rules and check if the borrower is hit
    for m in lst:
        if m[2] == 'lt':
            data[str(m[0])+'_hit']  = np.where(data[str(m[0])] < m[1], 1, 0)
        else:
            data[str(m[0])+'_hit']  = np.where(data[str(m[0])] > m[1], 1, 0)
            
    # Calculate the rule hits
    rule_hits = {}
    for i in lst:
        rule_hits.update({str(i[0]) + '_hit':data[str(i[0]) + '_hit'].sum()})

    # Calculate the hit rate
    rules = [r for r in data.columns if '_hit' in r]
    data['hit_any'] = data[rules].any(axis = 1)
    print('####### The Hit Rate ########')
    print('The hit rate is {}%'.format(data['hit_any'].mean()*100))

    data['hit_sum'] = data[rules].sum(axis = 1)
    print('####### Distribution of hitting #######')
    display(data['hit_sum'].value_counts(normalize = True))

    hit_info = []
    for r in rules:
        single_hit_rate = round(data[data[r] == 1].shape[0]/data.shape[0], 4)
        pure_hit_rate = round((data[(data[r] == 1)&(data['hit_sum']==1)].shape[0]/data.shape[0]), 4)
        if single_hit_rate == 0:
            single_hit_rate = 0.001
        pure_hit_pct = round(pure_hit_rate/single_hit_rate,4)
        hit_info.append((r, single_hit_rate, pure_hit_rate, pure_hit_pct))

    hit_info_df = pd.DataFrame(hit_info, columns = ['rule', 'single_hit_rate', 'pure_hit_rate', 'pure_hit_pct'])
    hit_info_df = hit_info_df.sort_values(['pure_hit_rate', 'pure_hit_pct'], ascending = [False, False])
    
    return hit_info_df, rule_hits, data

In [128]:
data_prep_2 = loan_data.drop('is_dlq_30d', axis = 1)

In [130]:
ruleset_calc(data_prep_2, rule_set)

####### The Hit Rate ########
The hit rate is 15.65%
####### Distribution of hitting #######


hit_sum
0    0.84350
1    0.15118
2    0.00532
Name: proportion, dtype: float64

(                                rule  single_hit_rate  pure_hit_rate  \
 0         ovd_order_cnt_6m_grade_hit           0.1247         0.1194   
 3            adr_stability_grade_hit           0.0371         0.0318   
 1                     risk_score_hit           0.0010         0.0000   
 2  last_6m_avg_asset_total_grade_hit           0.0010         0.0000   
 4      positive_biz_cnt_1y_grade_hit           0.0010         0.0000   
 5         repayment_ability_rank_hit           0.0010         0.0000   
 
    pure_hit_pct  
 0        0.9575  
 3        0.8571  
 1        0.0000  
 2        0.0000  
 4        0.0000  
 5        0.0000  ,
 {'ovd_order_cnt_6m_grade_hit': 6237,
  'risk_score_hit': 0,
  'last_6m_avg_asset_total_grade_hit': 0,
  'adr_stability_grade_hit': 1854,
  'positive_biz_cnt_1y_grade_hit': 0,
  'repayment_ability_rank_hit': 0})

## Results

In [137]:
hit_info_df, rule_hits, data_prep_3 = ruleset_calc(data_prep, rule_set)

####### The Hit Rate ########
The hit rate is 6.7894004022366135%
####### Distribution of hitting #######


hit_sum
0    0.932106
1    0.067894
Name: proportion, dtype: float64

In [139]:
data_prep_3.head()

Unnamed: 0,sample_id,sample_month,adr_stability_grade,last_6m_avg_asset_total_grade,ovd_order_cnt_6m_grade,positive_biz_cnt_1y_grade,risk_score,repayment_ability_rank,is_dlq_30d,ovd_order_cnt_6m_grade_hit,risk_score_hit,last_6m_avg_asset_total_grade_hit,adr_stability_grade_hit,positive_biz_cnt_1y_grade_hit,repayment_ability_rank_hit,hit_any,hit_sum
0,1,201802,10,5,1,10,1,1,0,0,0,0,0,0,0,False,0
1,2,201804,10,6,1,7,2,1,0,0,0,0,0,0,0,False,0
2,3,201803,10,5,1,7,3,1,0,0,0,0,0,0,0,False,0
3,4,201802,10,3,1,9,3,2,0,0,0,0,0,0,0,False,0
4,5,201804,10,5,1,10,1,2,0,0,0,0,0,0,0,False,0


In [141]:
data_prep_3 = data_prep_3[data_prep_3['hit_sum'] == 0]
new_bad_conc = data_prep_3['is_dlq_30d'].mean()

In [143]:
new_bad_conc

0.06280972139893301

In [149]:
passing_rate_old = data_prep.shape[0]/loan_data.shape[0]

In [151]:
passing_rate_old

0.90494

In [153]:
passing_rate_new = data_prep_3.shape[0]/loan_data.shape[0]

In [155]:
passing_rate_new

0.8435