In [2]:
# Loading packages
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Loading the dataset
loan_df = pd.read_csv('../raw_data/treated_df.csv')
df_backup = loan_df.copy()
loan_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466285 entries, 0 to 466284
Data columns (total 35 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   loan_amnt                   466285 non-null  int64  
 1   funded_amnt                 466285 non-null  int64  
 2   funded_amnt_inv             466285 non-null  float64
 3   term                        466285 non-null  object 
 4   int_rate                    466285 non-null  float64
 5   installment                 466285 non-null  float64
 6   grade                       466285 non-null  object 
 7   emp_length                  466285 non-null  object 
 8   home_ownership              466285 non-null  object 
 9   annual_inc                  466281 non-null  float64
 10  purpose                     466285 non-null  object 
 11  dti                         466285 non-null  float64
 12  delinq_2yrs                 466256 non-null  float64
 13  inq_last_6mths

# Encoding features

In [4]:
# Separating columns by preprocessing type

cols_cat = loan_df.select_dtypes('object').columns

cols_robust = ['total_rev_hi_lim', 'int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_amnt', 'collections_12_mths_ex_med', 'tot_coll_amt', 'acc_now_delinq', 'tot_cur_bal', 'total_rev_hi_lim']

cols_std = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv']

In [5]:
X = loan_df

In [6]:
#### Treating features
# Importing packages
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

# Standardizing continuous features
X['loan_amnt'], X['funded_amnt'], X['funded_amnt_inv'] =  StandardScaler().fit_transform(X[['loan_amnt','funded_amnt','funded_amnt_inv']]).T

# MinMaxing continuous features with outliers
X['total_rev_hi_lim'], X['int_rate'], X['installment'], X['annual_inc'], X['dti'], X['delinq_2yrs'], X['inq_last_6mths'], X['open_acc'], X['pub_rec'], X['revol_bal'], X['revol_util'], X['total_acc'], X['out_prncp'], X['out_prncp_inv'], X['total_pymnt'], X['total_pymnt_inv'], X['total_rec_prncp'], X['total_rec_int'], X['total_rec_late_fee'], X['recoveries'], X['collection_recovery_fee'], X['last_pymnt_amnt'], X['collections_12_mths_ex_med'], X['tot_coll_amt'], X['acc_now_delinq'], X['tot_cur_bal'], X['total_rev_hi_lim'] =  RobustScaler().fit_transform(X[['total_rev_hi_lim', 'int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_amnt', 'collections_12_mths_ex_med', 'tot_coll_amt', 'acc_now_delinq', 'tot_cur_bal', 'total_rev_hi_lim']]).T

X.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,emp_length,home_ownership,annual_inc,...,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,good_bad
0,-1.124392,-1.122963,-1.114455,36 months,-0.547273,-0.700313,B,10 years or more,RENT,-0.88717,...,0.0,0.0,0.0,-0.13021,0.0,0.0,,,,1
1,-1.426088,-1.425101,-1.412732,60 months,0.292727,-1.032818,C,0,RENT,-0.750682,...,0.0,117.08,1.11,-0.148284,0.0,0.0,,,,0
2,-1.438156,-1.437186,-1.424784,36 months,0.418182,-0.953758,C,10 years or more,RENT,-1.154413,...,0.0,0.0,0.0,0.036158,0.0,0.0,,,,1
3,-0.521001,-0.518687,-0.50886,36 months,-0.030909,-0.13095,C,10 years or more,RENT,-0.313922,...,16.97,0.0,0.0,-0.065561,0.0,0.0,,,,1
4,-1.365749,-1.364673,-1.352474,60 months,-0.176364,-1.007132,B,1 year,RENT,0.386715,...,0.0,0.0,0.0,-0.166326,0.0,0.0,,,,1


# Logit for Default probability

In [14]:
X.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'grade', 'emp_length', 'home_ownership', 'annual_inc',
       'purpose', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'acc_now_delinq', 'tot_coll_amt',
       'tot_cur_bal', 'total_rev_hi_lim', 'good_bad'],
      dtype='object')

In [15]:
X.select_dtypes('object').columns

Index(['term', 'grade', 'emp_length', 'home_ownership', 'purpose'], dtype='object')

In [39]:
import statsmodels.formula.api as smf

# Dummy coding is likely the most well known coding scheme. It compares each level of the categorical variable to a base reference level. The base reference level is the value of the intercept. It is the default contrast in Patsy for unordered categorical factors. The Treatment contrast matrix for race would be



# Specifying the model
logit_1 = smf.logit(formula='''
                  good_bad ~ loan_amnt + C(term) + int_rate + installment + C(grade) + C(emp_length) + C(home_ownership) + annual_inc + C(purpose) + dti + total_pymnt + recoveries + acc_now_delinq + tot_cur_bal 
                  '''
                  , data = loan_df).fit(maxiter = 2000)

# funded_amnt + funded_amnt_inv + delinq_2yrs + inq_last_6mths + open_acc + pub_rec + revol_bal + revol_util + total_acc + out_prncp + out_prncp_inv + total_pymnt_inv + total_rec_prncp + total_rec_int + total_rec_late_fee + total_rev_hi_lim + last_pymnt_amnt + collection_recovery_fee + collections_12_mths_ex_med + tot_coll_amt 


  return 1/(1+np.exp(-X))


Optimization terminated successfully.
         Current function value: 0.161612
         Iterations 290


In [40]:
logit_1.summary()
# R2 = 0.5066

  return 1/(1+np.exp(-X))


0,1,2,3
Dep. Variable:,good_bad,No. Observations:,396009.0
Model:,Logit,Df Residuals:,395978.0
Method:,MLE,Df Model:,30.0
Date:,"Sat, 10 Jun 2023",Pseudo R-squ.:,0.5066
Time:,16:10:40,Log-Likelihood:,-64000.0
converged:,True,LL-Null:,-129720.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.6719,0.063,26.587,0.000,1.549,1.795
C(term)[T. 60 months],-0.3832,0.041,-9.329,0.000,-0.464,-0.303
C(grade)[T.B],1.3265,0.045,29.605,0.000,1.239,1.414
C(grade)[T.C],2.6914,0.059,45.351,0.000,2.575,2.808
C(grade)[T.D],4.2616,0.079,53.780,0.000,4.106,4.417
C(grade)[T.E],5.9799,0.102,58.605,0.000,5.780,6.180
C(grade)[T.F],7.9511,0.130,61.102,0.000,7.696,8.206
C(grade)[T.G],8.7608,0.155,56.691,0.000,8.458,9.064
C(emp_length)[T.1 year],0.0444,0.040,1.105,0.269,-0.034,0.123


Features that were NOT significant:

emp_length:
1 year, 2 to 4 years, 7 to 9 years, unemployed

home_ownership:
OTHER

dti

recoveries

In [32]:
# Removing features that were not significant and testing again (dti and recoveries)

# First spec:
# good_bad ~ loan_amnt + C(term) + int_rate + installment + C(grade) + C(emp_length) + C(home_ownership) + annual_inc + C(purpose) + dti + total_pymnt + recoveries + acc_now_delinq + tot_cur_bal

model1 = smf.logit(formula='''
                  good_bad ~ loan_amnt + C(term) + int_rate + installment + C(grade) + C(emp_length) + C(home_ownership) + annual_inc + C(purpose) + total_pymnt + acc_now_delinq +  tot_cur_bal
                  '''
                  , data=X).fit(maxiter = 2000)

model1.summary()

# Caiu muito o R² = 0.3054. Não remover essas variáveis!

# funded_amnt + funded_amnt_inv + delinq_2yrs + inq_last_6mths + open_acc + pub_rec + revol_bal + revol_util + total_acc + out_prncp + out_prncp_inv + total_pymnt_inv + total_rec_prncp + total_rec_int + total_rec_late_fee + total_rev_hi_lim + last_pymnt_amnt + collection_recovery_fee + collections_12_mths_ex_med + tot_coll_amt 

Optimization terminated successfully.
         Current function value: 0.227541
         Iterations 9


0,1,2,3
Dep. Variable:,good_bad,No. Observations:,396009.0
Model:,Logit,Df Residuals:,395980.0
Method:,MLE,Df Model:,28.0
Date:,"Sat, 10 Jun 2023",Pseudo R-squ.:,0.3054
Time:,14:46:03,Log-Likelihood:,-90108.0
converged:,True,LL-Null:,-129720.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.6579,0.051,12.834,0.000,0.557,0.758
C(term)[T. 60 months],-0.3267,0.034,-9.667,0.000,-0.393,-0.260
C(grade)[T.B],1.6215,0.037,44.056,0.000,1.549,1.694
C(grade)[T.C],3.2225,0.049,66.110,0.000,3.127,3.318
C(grade)[T.D],5.0212,0.065,77.395,0.000,4.894,5.148
C(grade)[T.E],6.9335,0.083,83.449,0.000,6.771,7.096
C(grade)[T.F],8.9086,0.105,85.170,0.000,8.704,9.114
C(grade)[T.G],9.9650,0.125,79.551,0.000,9.719,10.211
C(emp_length)[T.1 year],0.0240,0.033,0.730,0.465,-0.040,0.089


In [33]:
# Model 2: trying more features

# First spec:
# good_bad ~ loan_amnt + C(term) + int_rate + installment + C(grade) + C(emp_length) + C(home_ownership) + annual_inc + C(purpose) + dti + total_pymnt + recoveries + acc_now_delinq + tot_cur_bal

# Specifying the model
model2 = smf.logit(formula='''
                  good_bad ~ loan_amnt + C(term) + int_rate + installment + C(grade) + C(emp_length) + C(home_ownership) + annual_inc + delinq_2yrs + C(purpose) + dti + total_pymnt + recoveries + acc_now_delinq + tot_cur_bal + revol_bal + total_pymnt_inv + total_rec_prncp
                  '''
                  , data=X).fit(maxiter = 2000)

# Outras variáveis que podem ser úteis
# delinq_2yrs + inq_last_6mths + pub_rec + revol_util + out_prncp

  return 1/(1+np.exp(-X))


         Current function value: 0.160477
         Iterations: 2000




In [34]:
model2.summary()

# R2 = 0.5101 -> aumentou mt pouco: 0.0034. Não justifica a inclusão das outras variáveis.

  return 1/(1+np.exp(-X))


0,1,2,3
Dep. Variable:,good_bad,No. Observations:,396009.0
Model:,Logit,Df Residuals:,395974.0
Method:,MLE,Df Model:,34.0
Date:,"Sat, 10 Jun 2023",Pseudo R-squ.:,0.5101
Time:,14:55:43,Log-Likelihood:,-63550.0
converged:,False,LL-Null:,-129720.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.3748,0.069,19.818,0.000,1.239,1.511
C(term)[T. 60 months],-0.1305,0.043,-3.052,0.002,-0.214,-0.047
C(grade)[T.B],1.3375,0.045,29.783,0.000,1.249,1.426
C(grade)[T.C],2.6875,0.059,45.235,0.000,2.571,2.804
C(grade)[T.D],4.2782,0.079,53.876,0.000,4.123,4.434
C(grade)[T.E],6.0660,0.102,59.266,0.000,5.865,6.267
C(grade)[T.F],8.1198,0.131,62.178,0.000,7.864,8.376
C(grade)[T.G],9.0698,0.155,58.652,0.000,8.767,9.373
C(emp_length)[T.1 year],0.0457,0.040,1.137,0.255,-0.033,0.125


#### Especificação usada:

logit_1:

good_bad ~ loan_amnt + C(term) + int_rate + installment + C(grade) + C(emp_length) + C(home_ownership) + annual_inc + C(purpose) + dti + total_pymnt + recoveries + acc_now_delinq + tot_cur_bal

👉 Use 
accuracy when:

Target classes are balanced
Prediction of each class is equally important

# Estimating Interest Rates via Linear Regression

In [8]:
X.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'grade', 'emp_length', 'home_ownership', 'annual_inc',
       'purpose', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'acc_now_delinq', 'tot_coll_amt',
       'tot_cur_bal', 'total_rev_hi_lim', 'good_bad'],
      dtype='object')

In [13]:
X_lin = X[['loan_amnt', 'term', 'grade', 'installment', 'emp_length', 'home_ownership', 'annual_inc', 'purpose', 'delinq_2yrs', 'acc_now_delinq', 'total_rev_hi_lim']]

y = X['int_rate']

In [17]:
X_lin['term'].unique()
# grade
# emp_length
# home_ownership
# purpose

array([' 36 months', ' 60 months'], dtype=object)

In [20]:
X_lin.head()

Unnamed: 0,loan_amnt,term,grade,installment,emp_length,home_ownership,annual_inc,purpose,delinq_2yrs,acc_now_delinq,total_rev_hi_lim
0,-1.124392,36 months,B,-0.700313,10 years or more,RENT,-0.88717,credit card,0.0,0.0,
1,-1.426088,60 months,C,-1.032818,0,RENT,-0.750682,"home improvement, major purchase or car",0.0,0.0,
2,-1.438156,36 months,C,-0.953758,10 years or more,RENT,-1.154413,small business or educational,0.0,0.0,
3,-0.521001,36 months,C,-0.13095,10 years or more,RENT,-0.313922,"renewable energy, moving, house or other",0.0,0.0,
4,-1.365749,60 months,B,-1.007132,1 year,RENT,0.386715,"renewable energy, moving, house or other",0.0,0.0,


In [28]:
# Encoding categorical features
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(handle_unknown = 'ignore', sparse = False)

# Encoding categorical features


X_lin[ohe.get_feature_names_out()] = ohe.fit_transform(X_lin[['term', 'grade', 'emp_length', 'home_ownership', 'purpose']])
# X_lin[ohe.get_feature_names_out()] = ohe.transform(X_lin[['term']])
# X_lin.drop(columns = ['term'], inplace = True)

X_lin.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_lin[ohe.get_feature_names_out()] = ohe.fit_transform(X_lin[['term', 'grade', 'emp_length', 'home_ownership', 'purpose']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_lin[ohe.get_feature_names_out()] = ohe.fit_transform(X_lin[['term', 'grade', 'emp_length', 'home_ownership', 'purpose']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

Unnamed: 0,loan_amnt,term,grade,installment,emp_length,home_ownership,annual_inc,purpose,delinq_2yrs,acc_now_delinq,...,home_ownership_MORTGAGE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,purpose_credit card,purpose_debt consolidation,"purpose_home improvement, major purchase or car","purpose_medical, wedding or vacation","purpose_renewable energy, moving, house or other",purpose_small business or educational
0,-1.124392,36 months,B,-0.700313,10 years or more,RENT,-0.88717,credit card,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,-1.426088,60 months,C,-1.032818,0,RENT,-0.750682,"home improvement, major purchase or car",0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-1.438156,36 months,C,-0.953758,10 years or more,RENT,-1.154413,small business or educational,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.521001,36 months,C,-0.13095,10 years or more,RENT,-0.313922,"renewable energy, moving, house or other",0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-1.365749,60 months,B,-1.007132,1 year,RENT,0.386715,"renewable energy, moving, house or other",0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [29]:
X_lin.drop(columns = ['term', 'grade', 'emp_length', 'home_ownership', 'purpose'], inplace = True)

X_lin.head()

Unnamed: 0,loan_amnt,installment,annual_inc,delinq_2yrs,acc_now_delinq,total_rev_hi_lim,term_ 36 months,term_ 60 months,grade_A,grade_B,...,home_ownership_MORTGAGE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,purpose_credit card,purpose_debt consolidation,"purpose_home improvement, major purchase or car","purpose_medical, wedding or vacation","purpose_renewable energy, moving, house or other",purpose_small business or educational
0,-1.124392,-0.700313,-0.88717,0.0,0.0,,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,-1.426088,-1.032818,-0.750682,0.0,0.0,,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-1.438156,-0.953758,-1.154413,0.0,0.0,,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.521001,-0.13095,-0.313922,0.0,0.0,,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-1.365749,-1.007132,0.386715,0.0,0.0,,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [33]:
import statsmodels.api as sm

X = sm.add_constant(X_lin)
int_model = sm.OLS(y, X_lin, missing = 'drop')
results = int_model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               int_rate   R-squared:                       0.930
Model:                            OLS   Adj. R-squared:                  0.930
Method:                 Least Squares   F-statistic:                 1.946e+05
Date:                Sat, 10 Jun 2023   Prob (F-statistic):               0.00
Time:                        15:56:00   Log-Likelihood:                 54672.
No. Observations:              396009   AIC:                        -1.093e+05
Df Residuals:                  395981   BIC:                        -1.090e+05
Df Model:                          27                                         
Covariance Type:            nonrobust                                         
                                                       coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------

NameError: name 'y' is not defined

# Neural Network to determine Default