In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector

from transformode import DataFrameOneHotEncoder 

%matplotlib inline

Import the dataset

In [2]:
loan_df = pd.read_csv('Raw Data/Loan_Default.csv')

In [3]:
#Get details, but also check data types for processing
loan_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148670 entries, 0 to 148669
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   ID                         148670 non-null  int64  
 1   year                       148670 non-null  int64  
 2   loan_limit                 145326 non-null  object 
 3   Gender                     148670 non-null  object 
 4   approv_in_adv              147762 non-null  object 
 5   loan_type                  148670 non-null  object 
 6   loan_purpose               148536 non-null  object 
 7   Credit_Worthiness          148670 non-null  object 
 8   open_credit                148670 non-null  object 
 9   business_or_commercial     148670 non-null  object 
 10  loan_amount                148670 non-null  int64  
 11  rate_of_interest           112231 non-null  float64
 12  Interest_rate_spread       112031 non-null  float64
 13  Upfront_charges            10

In [4]:
y = loan_df['Status']
loan_df = loan_df.drop(columns=['Status', 'ID'])

In [5]:
loan_df['year'] = str(loan_df['year'])
cat = loan_df[make_column_selector(dtype_include= object)]
dfohe = DataFrameOneHotEncoder()
catx = dfohe.fit_transform(cat)

In [6]:
numdf = loan_df[make_column_selector(dtype_include= np.number)]
ss = StandardScaler()
numx = pd.DataFrame(ss.fit_transform(numdf), columns=numdf.columns)

transformer = make_column_transformer(
                (StandardScaler(), make_column_selector(dtype_include = np.number)),
                (DataFrameOneHotEncoder(), make_column_selector(dtype_include = object)))
X = transformer.fit_transform(loan_df)


In [7]:
X = numx.join(catx)

In [8]:
numx.isna().sum()

loan_amount                 0
rate_of_interest        36439
Interest_rate_spread    36639
Upfront_charges         39642
term                       41
property_value          15098
income                   9150
Credit_Score                0
LTV                     15098
dtir1                   24121
dtype: int64

In [8]:
missing_cols = ['rate_of_interest', 'Interest_rate_spread', 'Upfront_charges', 'term', 'property_value', 'income', 'LTV', 'dtir1']
X = X.fillna(value =0)

In [9]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2)

Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score

#the grid of parameters to search over
Cs = [0.001, 0.1, 1, 10, 100]

max_score = 0
max_c = 0
max_test_sizes = 0
max_recall = 0
max_c_r = 0
max_test_sizes_r = 0

for c in Cs:
    clf = LogisticRegression(C = c, max_iter=1000)
    for test_sizes in [.2,.3,.4,.5,.6,.7,.8]:
        train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = test_sizes)
        clf.fit(train_X, train_y)
        score = accuracy_score(clf.predict(test_X), test_y)
        if score >= max_score:
            max_score = score
            max_c = c
            max_test_sizes = test_sizes
        recall = recall_score(test_y  , clf.predict(test_X))
        if recall >= max_recall:
            max_recall = recall
            max_c_r = c
            max_test_sizes_r = test_sizes
print('The max score is', max_score, 'and the max c is', max_c, 'with test size of', max_test_sizes)
print('The max recall is', max_recall, 'and the max c is', max_c, 'with test size of', max_test_sizes)

The max score is 0.8729737001412524 and the max c is 1 with test size of 0.2
The max recall is 0.509366391184573 and the max c is 1 with test size of 0.2


In [16]:
y_logpred = clf.predict(test_X)

In [39]:
#Tree
from sklearn.tree import DecisionTreeClassifier

max_score = 0
max_d = 0
max_test_sizes = 0
max_recall = 0
max_d_r = 0
max_test_sizes_r = 0

for d in np.arange(3,15):
    tree = DecisionTreeClassifier(max_depth = d)
    tree  = clf.fit(train_X,train_y)
    score = accuracy_score(tree.predict(test_X), test_y)
    if score > max_score:
        max_score = score
        max_d = d
        max_test_sizes = test_sizes
    recall = recall_score(test_y  , clf.predict(test_X))
    if recall > max_recall:
        max_recall = recall
        max_d_r = d
        max_test_sizes_r = test_sizes
print('The max score is', max_score, 'and the max d is', max_d)
print('The max recall is', max_recall, 'and the max d is', max_d_r)


The max score is 0.869442389184099 and the max d is 3
The max recall is 0.5063364447775431 and the max d is 3
