In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector

from transformode import DataFrameOneHotEncoder 

%matplotlib inline

Import the dataset

In [3]:
loan_df = pd.read_csv('Raw Data/Loan_Default.csv')

In [4]:
#Get details, but also check data types for processing
loan_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148670 entries, 0 to 148669
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   ID                         148670 non-null  int64  
 1   year                       148670 non-null  int64  
 2   loan_limit                 145326 non-null  object 
 3   Gender                     148670 non-null  object 
 4   approv_in_adv              147762 non-null  object 
 5   loan_type                  148670 non-null  object 
 6   loan_purpose               148536 non-null  object 
 7   Credit_Worthiness          148670 non-null  object 
 8   open_credit                148670 non-null  object 
 9   business_or_commercial     148670 non-null  object 
 10  loan_amount                148670 non-null  int64  
 11  rate_of_interest           112231 non-null  float64
 12  Interest_rate_spread       112031 non-null  float64
 13  Upfront_charges            10

In [5]:
y = loan_df['Status']
loan_df = loan_df.drop(columns=['Status', 'ID'])

In [6]:
loan_df['year'] = str(loan_df['year'])
cat = loan_df[make_column_selector(dtype_include= object)]
dfohe = DataFrameOneHotEncoder()
catx = dfohe.fit_transform(cat)

In [7]:
numdf = loan_df[make_column_selector(dtype_include= np.number)]
ss = StandardScaler()
numx = pd.DataFrame(ss.fit_transform(numdf), columns=numdf.columns)

transformer = make_column_transformer(
                (StandardScaler(), make_column_selector(dtype_include = np.number)),
                (DataFrameOneHotEncoder(), make_column_selector(dtype_include = object)))
X = transformer.fit_transform(loan_df)


In [8]:
X = numx.join(catx)

In [9]:
numx.isna().sum()

loan_amount                 0
rate_of_interest        36439
Interest_rate_spread    36639
Upfront_charges         39642
term                       41
property_value          15098
income                   9150
Credit_Score                0
LTV                     15098
dtir1                   24121
dtype: int64

In [10]:
missing_cols = ['rate_of_interest', 'Interest_rate_spread', 'Upfront_charges', 'term', 'property_value', 'income', 'LTV', 'dtir1']
X = X.fillna(value =0)

In [11]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2)

Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score

#the grid of parameters to search over
parameters = {
    'penalty' : ['l1','l2'], 
    'C'       : np.logspace(-3,3,7),
    'solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
}


logreg = LogisticRegression(max_iter=1000)
clf = GridSearchCV(logreg,                    # model
                   param_grid = parameters,   # hyperparameters
                   scoring='accuracy',        # metric for scoring
                   cv=10) 


clf.fit(train_X,train_y)

print("Tuned Hyperparameters :", clf.best_params_)
print("Accuracy :",clf.best_score_)


140 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
70 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/kaylawilding/opt/anaconda3/envs/capstone/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/kaylawilding/opt/anaconda3/envs/capstone/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/kaylawilding/opt/anaconda3/envs/capstone/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueEr

Tuned Hyperparameters : {'C': 10.0, 'penalty': 'l2', 'solver': 'newton-cg'}
Accuracy : 0.8670041277055482


In [14]:
best_logreg = LogisticRegression(max_iter=1000, C=10, penalty='l2', solver = 'newton-cg')
best_logreg.fit(train_X, train_y)
y_pred = best_logreg.predict(test_X)

print("Accuracy:",best_logreg.score(test_X, test_y))

Accuracy: 0.8688033900585189


In [16]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier()

params = {
    "criterion" : ['gini', 'entropy'],
    "max_depth" : range(1,10),
    "min_samples_split" : range(2,10),
    "min_samples_leaf": range(1,5)
}
grid = GridSearchCV(decision_tree, 
                param_grid = params, 
                cv = 10,
                verbose = 1,
                n_jobs = -1)

grid.fit(train_X, train_y)
            

Fitting 10 folds for each of 576 candidates, totalling 5760 fits




In [17]:
grid.best_estimator_

In [18]:
grid.best_params_

{'criterion': 'entropy',
 'max_depth': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 3}

The best Decision Tree Classifier is criterion 'entropy', depth = 3, and min sample split = 3

In [21]:
best_tree_class = DecisionTreeClassifier(criterion = 'entropy', max_depth = 3, min_samples_split =3, min_samples_leaf = 1)
best_tree_class.fit(train_X, train_y)

y_pred_tree = best_tree_class.predict(test_X)

print("Accuracy:",best_tree_class.score(test_X, test_y))

Accuracy: 1.0
