In [2]:
import numpy as np
import pandas as pd
import warnings
import os
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support,confusion_matrix




# Internal Data
 

In [3]:
Internal_path = Path.cwd().parent / "data"/ "interim" / "case_study1.xlsx"


# Cibil External data

In [4]:
External_path = Path.cwd().parent / "data"/  "external" / "case_study2.xlsx"

In [5]:
df1 = pd.read_excel(External_path)
df2 = pd.read_excel(Internal_path)

 # Internal Data clean up
 

In [6]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51336 entries, 0 to 51335
Data columns (total 62 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   PROSPECTID                    51336 non-null  int64  
 1   time_since_recent_payment     51336 non-null  int64  
 2   time_since_first_deliquency   51336 non-null  int64  
 3   time_since_recent_deliquency  51336 non-null  int64  
 4   num_times_delinquent          51336 non-null  int64  
 5   max_delinquency_level         51336 non-null  int64  
 6   max_recent_level_of_deliq     51336 non-null  int64  
 7   num_deliq_6mts                51336 non-null  int64  
 8   num_deliq_12mts               51336 non-null  int64  
 9   num_deliq_6_12mts             51336 non-null  int64  
 10  max_deliq_6mts                51336 non-null  int64  
 11  max_deliq_12mts               51336 non-null  int64  
 12  num_times_30p_dpd             51336 non-null  int64  
 13  n

In [7]:
def null_value(df):
    columns_remove_list = []
    for i in df.columns:
        if df.loc[df[i] == -99999].shape[0] > 10000:
            columns_remove_list.append(i)
    return columns_remove_list

In [8]:
null_value(df1)

['time_since_first_deliquency',
 'time_since_recent_deliquency',
 'max_delinquency_level',
 'max_deliq_6mts',
 'max_deliq_12mts',
 'CC_utilization',
 'PL_utilization',
 'max_unsec_exposure_inPct']

In [9]:
def clean_data(df):
    return (
        df
        .drop(null_value(df),axis=1)
        .replace(-99999, np.nan)  # Replace -99999 with NaN
        .dropna()
    )#.isna().sum()
   

In [10]:
clean_data(df1)

Unnamed: 0,PROSPECTID,time_since_recent_payment,num_times_delinquent,max_recent_level_of_deliq,num_deliq_6mts,num_deliq_12mts,num_deliq_6_12mts,num_times_30p_dpd,num_times_60p_dpd,num_std,...,pct_PL_enq_L6m_of_L12m,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,last_prod_enq2,first_prod_enq2,Credit_Score,Approved_Flag
0,1,549.0,11,29,0,0,0,0,0,21,...,0.0,0.0,0.000,0.0,1,0,PL,PL,696,P2
1,2,47.0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.000,0.0,0,0,ConsumerLoan,ConsumerLoan,685,P2
2,3,302.0,9,25,1,9,8,0,0,10,...,0.0,0.0,0.000,0.0,1,0,ConsumerLoan,others,693,P2
4,5,583.0,0,0,0,0,0,0,0,53,...,0.0,0.0,0.000,0.0,0,0,AL,AL,753,P1
5,6,245.0,14,270,0,0,0,13,11,5,...,1.0,0.0,0.429,0.0,1,0,ConsumerLoan,PL,668,P3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51331,51332,15.0,2,24,0,0,0,0,0,0,...,0.0,0.0,0.000,0.0,0,0,ConsumerLoan,ConsumerLoan,650,P4
51332,51333,57.0,0,0,0,0,0,0,0,6,...,0.0,0.0,0.000,0.0,0,0,others,others,702,P1
51333,51334,32.0,0,0,0,0,0,0,0,0,...,1.0,0.0,1.000,0.0,0,0,ConsumerLoan,others,661,P3
51334,51335,58.0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.000,0.0,0,0,ConsumerLoan,others,686,P2


In [11]:
clean_data(df2).columns

Index(['PROSPECTID', 'Total_TL', 'Tot_Closed_TL', 'Tot_Active_TL',
       'Total_TL_opened_L6M', 'Tot_TL_closed_L6M', 'pct_tl_open_L6M',
       'pct_tl_closed_L6M', 'pct_active_tl', 'pct_closed_tl',
       'Total_TL_opened_L12M', 'Tot_TL_closed_L12M', 'pct_tl_open_L12M',
       'pct_tl_closed_L12M', 'Tot_Missed_Pmnt', 'Auto_TL', 'CC_TL',
       'Consumer_TL', 'Gold_TL', 'Home_TL', 'PL_TL', 'Secured_TL',
       'Unsecured_TL', 'Other_TL', 'Age_Oldest_TL', 'Age_Newest_TL'],
      dtype='object')

In [12]:
for i in clean_data(df1).columns:
    if i in  clean_data(df2).columns:
        print(i)
    

PROSPECTID


In [13]:
df = pd.merge(clean_data(df1),clean_data(df2),how='inner',left_on= ["PROSPECTID"],right_on=['PROSPECTID'])

In [14]:
df

Unnamed: 0,PROSPECTID,time_since_recent_payment,num_times_delinquent,max_recent_level_of_deliq,num_deliq_6mts,num_deliq_12mts,num_deliq_6_12mts,num_times_30p_dpd,num_times_60p_dpd,num_std,...,CC_TL,Consumer_TL,Gold_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,Other_TL,Age_Oldest_TL,Age_Newest_TL
0,1,549.0,11,29,0,0,0,0,0,21,...,0,0,1,0,4,1,4,0,72.0,18.0
1,2,47.0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,7.0,7.0
2,3,302.0,9,25,1,9,8,0,0,10,...,0,6,1,0,0,2,6,0,47.0,2.0
3,5,583.0,0,0,0,0,0,0,0,53,...,0,0,0,0,0,3,0,2,131.0,32.0
4,6,245.0,14,270,0,0,0,13,11,5,...,0,0,2,0,0,6,0,0,150.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42059,51332,15.0,2,24,0,0,0,0,0,0,...,0,2,0,0,0,0,3,1,24.0,5.0
42060,51333,57.0,0,0,0,0,0,0,0,6,...,0,2,0,0,0,2,2,0,74.0,7.0
42061,51334,32.0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,2,0,9.0,5.0
42062,51335,58.0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,2,0,15.0,8.0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 79 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   PROSPECTID                  42064 non-null  int64  
 1   time_since_recent_payment   42064 non-null  float64
 2   num_times_delinquent        42064 non-null  int64  
 3   max_recent_level_of_deliq   42064 non-null  int64  
 4   num_deliq_6mts              42064 non-null  int64  
 5   num_deliq_12mts             42064 non-null  int64  
 6   num_deliq_6_12mts           42064 non-null  int64  
 7   num_times_30p_dpd           42064 non-null  int64  
 8   num_times_60p_dpd           42064 non-null  int64  
 9   num_std                     42064 non-null  int64  
 10  num_std_6mts                42064 non-null  int64  
 11  num_std_12mts               42064 non-null  int64  
 12  num_sub                     42064 non-null  int64  
 13  num_sub_6mts                420

In [16]:
df.head()

Unnamed: 0,PROSPECTID,time_since_recent_payment,num_times_delinquent,max_recent_level_of_deliq,num_deliq_6mts,num_deliq_12mts,num_deliq_6_12mts,num_times_30p_dpd,num_times_60p_dpd,num_std,...,CC_TL,Consumer_TL,Gold_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,Other_TL,Age_Oldest_TL,Age_Newest_TL
0,1,549.0,11,29,0,0,0,0,0,21,...,0,0,1,0,4,1,4,0,72.0,18.0
1,2,47.0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,7.0,7.0
2,3,302.0,9,25,1,9,8,0,0,10,...,0,6,1,0,0,2,6,0,47.0,2.0
3,5,583.0,0,0,0,0,0,0,0,53,...,0,0,0,0,0,3,0,2,131.0,32.0
4,6,245.0,14,270,0,0,0,13,11,5,...,0,0,2,0,0,6,0,0,150.0,17.0


# chi2_contingency

In [17]:
categorical_columns = (df.select_dtypes(include=["O"])
 .columns
 .drop("Approved_Flag"))

In [18]:
categorical_columns

Index(['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2',
       'first_prod_enq2'],
      dtype='object')

## criteria 

if pval is less than or equal to <= 0.05 we accept

In [19]:
for i in categorical_columns:
    chai,pval,_,_ = chi2_contingency(pd.crosstab(df[i],df['Approved_Flag']))
    print(i ," --- ",pval )

MARITALSTATUS  ---  3.578180861038862e-233
EDUCATION  ---  2.6942265249737532e-30
GENDER  ---  1.907936100186563e-05
last_prod_enq2  ---  0.0
first_prod_enq2  ---  7.84997610555419e-287


# vif check

In [20]:
numerical_columns = (df.select_dtypes(exclude="O")
 .columns
 .drop("PROSPECTID","Approved_Flag"))

In [21]:
len(numerical_columns)

72

In [22]:
vif_df = df[numerical_columns]
columns_to_be_kept = []
columns_index = 0 

In [23]:
for i in range(0, vif_df.shape[1]):
    vif_score = variance_inflation_factor(vif_df,columns_index)
    print( i,"---" ,vif_score)
    
    if vif_score <= 6:
        columns_to_be_kept.append(numerical_columns[i])
        columns_index = columns_index+1
    else:
        vif_df = vif_df.drop([numerical_columns[i]],axis=1)

0 --- 2.735596362883496
1 --- 6.961314467465788
2 --- 4.925184834688885


  vif = 1. / (1. - r_squared_i)


3 --- inf
4 --- 6.650799569985378
5 --- 1.3261593810039676
6 --- 7.653071644611269
7 --- 1.5935715009903269
8 --- 6.296462262702086
9 --- 13.653414420768309
10 --- 2.24601870979017
11 --- 1.499652196823596
12 --- 2.1710876929486034
13 --- 2.6200918994860745
14 --- 2.2943371922808034
15 --- 7.359219543257251
16 --- 2.159726008571296
17 --- 2.86791403772506
18 --- 6.457158592091426
19 --- 2.8475106760699984
20 --- 4.50444819658257
21 --- 11.921106134592375
22 --- 5.982936215668196
23 --- 8.50376169154305
24 --- 5.173545596769714
25 --- 7.330664304375385
26 --- 11.511382400362313
27 --- 3.0072075526966158
28 --- 1.5997364012032687
29 --- 12.927000883758865
30 --- 7.9954118163524335
31 --- 3.0785991090119182
32 --- 1.4103842150984582
33 --- 1.049342850511928
34 --- 1.2689983441211283
35 --- inf


  vif = 1. / (1. - r_squared_i)


36 --- 5.075998030360695
37 --- 1.000928171040862
38 --- 2.847983163098133
39 --- 2.3163407816170465
40 --- 15.849154197263223
41 --- 16.39374389295382
42 --- 1.5464122198584578
43 --- 1.5038859167469592
44 --- 1.7845232237212945
45 --- 4.099306113957539
46 --- 5.396294847902337
47 --- inf


  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)


48 --- inf
49 --- 11.07280221305783
50 --- 8.06788937832173
51 --- 6.427524985814936
52 --- 5.009710397629791
53 --- 2.595623307158872
54 --- 2572.050081635326
55 --- 7.300507771993867
56 --- 7.709307236261906
57 --- 3.788104917882002
58 --- 5.007074482679259
59 --- 4.457295407081155
60 --- 1.944133775053528
61 --- inf


  vif = 1. / (1. - r_squared_i)


62 --- 4.711300840335142
63 --- 22.607287499893975
64 --- 24.992957498610757
65 --- 4.358048547759654
66 --- 2.781269487415352
67 --- 2.568970862401073
68 --- 3.8269353128182453
69 --- 2.046609871897926
70 --- 4.496792209791075
71 --- 5.685555077402055


In [24]:
len(columns_to_be_kept)

44

In [25]:
categorical_columns

Index(['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2',
       'first_prod_enq2'],
      dtype='object')

# chaking anova


In [26]:
num_columns_to_kept = []

In [27]:
for  i in columns_to_be_kept:
   a = list(df[i])
   b = list(df["Approved_Flag"])
   
   group_P1 = [value for value, group in zip(a,b) if group == "P1"]
   group_P2 = [value for value, group in zip(a,b) if group == "P2"]
   group_P3 = [value for value, group in zip(a,b) if group == "P3"]
   group_P4 = [value for value, group in zip(a,b) if group == "P4"]
   
   f_statistic , p_value  = f_oneway(group_P1,group_P2,group_P3,group_P4)
   
   if p_value <= 0.5:
      num_columns_to_kept.append(i)
      


In [28]:
len(num_columns_to_kept)

43

In [29]:
all_features = list(num_columns_to_kept) + list(categorical_columns)

df = df[all_features + ["Approved_Flag"]]

In [30]:
df.shape

(42064, 49)

## categorical columns value chuck

In [31]:
for i in categorical_columns:
    print(i,"--",df[i].unique())

MARITALSTATUS -- ['Married' 'Single']
EDUCATION -- ['12TH' 'GRADUATE' 'SSC' 'POST-GRADUATE' 'UNDER GRADUATE' 'OTHERS'
 'PROFESSIONAL']
GENDER -- ['M' 'F']
last_prod_enq2 -- ['PL' 'ConsumerLoan' 'AL' 'CC' 'others' 'HL']
first_prod_enq2 -- ['PL' 'ConsumerLoan' 'others' 'AL' 'HL' 'CC']


## EDUCATION label encoder

* SSC              : 1
* 12TH             : 2
* GRADUATE         : 3
* UNDER GRADUATE   : 3
* POST-GRADUATE    : 4
* OTHERS           : 1
* PROFESSIONAL     : 3

In [32]:


df.loc[df["EDUCATION"] == 'SSC',['EDUCATION']] = 1
df.loc[df["EDUCATION"] == '12TH',['EDUCATION']] = 2
df.loc[df["EDUCATION"] == 'GRADUATE',['EDUCATION']] = 3
df.loc[df["EDUCATION"] == 'POST-GRADUATE',['EDUCATION']] = 4
df.loc[df["EDUCATION"] == 'UNDER GRADUATE',['EDUCATION']] = 3
df.loc[df["EDUCATION"] == 'OTHERS',['EDUCATION']] = 1
df.loc[df["EDUCATION"] == 'PROFESSIONAL',['EDUCATION']] = 3

In [33]:
df['EDUCATION'].unique()

array([2, 3, 1, 4], dtype=object)

In [34]:
df['EDUCATION'].value_counts()
df['EDUCATION'] = df['EDUCATION'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['EDUCATION'] = df['EDUCATION'].astype(int)


In [35]:
df['EDUCATION'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 42064 entries, 0 to 42063
Series name: EDUCATION
Non-Null Count  Dtype
--------------  -----
42064 non-null  int32
dtypes: int32(1)
memory usage: 164.4 KB


In [36]:
list(categorical_columns.drop("EDUCATION"))

['MARITALSTATUS', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']

In [37]:
df_encoded = pd.get_dummies(df,columns=['MARITALSTATUS', 'GENDER', 'last_prod_enq2', 'first_prod_enq2'],drop_first=True)

In [38]:
df_encoded.head()

Unnamed: 0,time_since_recent_payment,max_recent_level_of_deliq,num_deliq_6_12mts,num_times_60p_dpd,num_std_12mts,num_sub,num_sub_6mts,num_sub_12mts,num_dbt,num_dbt_12mts,...,last_prod_enq2_CC,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others
0,549.0,29,0,0,11,0,0,0,0,0,...,False,False,False,True,False,False,False,False,True,False
1,47.0,0,0,0,0,0,0,0,0,0,...,False,True,False,False,False,False,True,False,False,False
2,302.0,25,8,0,10,0,0,0,0,0,...,False,True,False,False,False,False,False,False,False,True
3,583.0,0,0,0,16,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
4,245.0,270,0,11,2,3,0,1,0,0,...,False,True,False,False,False,False,False,False,True,False


In [39]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 57 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   time_since_recent_payment     42064 non-null  float64
 1   max_recent_level_of_deliq     42064 non-null  int64  
 2   num_deliq_6_12mts             42064 non-null  int64  
 3   num_times_60p_dpd             42064 non-null  int64  
 4   num_std_12mts                 42064 non-null  int64  
 5   num_sub                       42064 non-null  int64  
 6   num_sub_6mts                  42064 non-null  int64  
 7   num_sub_12mts                 42064 non-null  int64  
 8   num_dbt                       42064 non-null  int64  
 9   num_dbt_12mts                 42064 non-null  int64  
 10  num_lss                       42064 non-null  int64  
 11  num_lss_12mts                 42064 non-null  int64  
 12  recent_level_of_deliq         42064 non-null  int64  
 13  C

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 49 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   time_since_recent_payment   42064 non-null  float64
 1   max_recent_level_of_deliq   42064 non-null  int64  
 2   num_deliq_6_12mts           42064 non-null  int64  
 3   num_times_60p_dpd           42064 non-null  int64  
 4   num_std_12mts               42064 non-null  int64  
 5   num_sub                     42064 non-null  int64  
 6   num_sub_6mts                42064 non-null  int64  
 7   num_sub_12mts               42064 non-null  int64  
 8   num_dbt                     42064 non-null  int64  
 9   num_dbt_12mts               42064 non-null  int64  
 10  num_lss                     42064 non-null  int64  
 11  num_lss_12mts               42064 non-null  int64  
 12  recent_level_of_deliq       42064 non-null  int64  
 13  CC_enq                      420

# machine  learning model fitting

In [41]:
for i in df_encoded.select_dtypes(include=["boolean"]).columns:
    df_encoded[i] = df_encoded[i].astype(int)

In [42]:
df_encoded.head()

Unnamed: 0,time_since_recent_payment,max_recent_level_of_deliq,num_deliq_6_12mts,num_times_60p_dpd,num_std_12mts,num_sub,num_sub_6mts,num_sub_12mts,num_dbt,num_dbt_12mts,...,last_prod_enq2_CC,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others
0,549.0,29,0,0,11,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,47.0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
2,302.0,25,8,0,10,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,583.0,0,0,0,16,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,245.0,270,0,11,2,3,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0


In [43]:
X = df_encoded.drop(columns=["Approved_Flag"], axis=1)
Y = df['Approved_Flag']

In [44]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [45]:
rf_classifier = RandomForestClassifier(n_estimators = 200, random_state=42)


In [46]:
rf_classifier.fit(x_train,y_train)

In [47]:
y_pred = rf_classifier.predict(x_test)


In [48]:
y_pred_train = rf_classifier.predict(x_train)

In [53]:
accuracy = accuracy_score(y_test,y_pred)
print("accuracy :",accuracy)

accuracy = accuracy_score(y_train,y_pred_train)
print("accuracy :",accuracy)

accuracy : 0.9901343159396172
accuracy : 1.0


: 

In [50]:
precision, recall , F1_score , _ = precision_recall_fscore_support(y_test,y_pred)

In [51]:
for i, v in enumerate(["P1","P2","P3","P4"]):
    print(f"Class {v}")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1_score: {F1_score[i]}")
    print()


Class P1
Precision: 0.9398496240601504
Recall: 0.9861932938856016
F1_score: 0.9624639076034649

Class P2
Precision: 0.9988121164125916
Recall: 1.0
F1_score: 0.999405705229794

Class P3
Precision: 0.9897557131599685
Recall: 0.9479245283018868
F1_score: 0.9683885890516577

Class P4
Precision: 1.0
Recall: 1.0
F1_score: 1.0



In [52]:
confusion_matrix(y_test,y_pred)

array([[1000,    1,   13,    0],
       [   0, 5045,    0,    0],
       [  64,    5, 1256,    0],
       [   0,    0,    0, 1029]], dtype=int64)