In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the lending dataset from the Resources folder and preview its last rows
csv_path = Path('Resources/lending_data.csv')
data = pd.read_csv(csv_path)
data.tail()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
77531,19100.0,11.261,86600,0.65358,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1
77535,15600.0,9.742,72300,0.585062,9,2,42300,1


In [3]:
# Report the dataset dimensions as (row count, column count)
(len(data.index), len(data.columns))

(77536, 8)

In [4]:
# Flag every row that exactly repeats an earlier row, then count the flagged rows
is_repeat = data.duplicated()
duplicates = data.loc[is_repeat]
duplicates.shape

(72307, 8)

## Prediction

This dataset is already preprocessed. There are duplicate rows, but that makes sense in the scope of this problem. It is a little suspicious that there are so many of them (72,307 of the 77,536 rows repeat an earlier row), but since identical loans are theoretically possible, it's best not to drop any data. Since all of the data is numeric, Logistic Regression should perform well. I expect Random Forest to perform slightly better, since there are several features involved and the Random Forest method generally has the edge when more variables are being compared.

In [5]:
# Separate the features from the target column, then hold out a test split
X = data.drop(columns = 'loan_status')
y = data['loan_status']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = 1,
)

# Fit the scaler on the training split only, so no statistics from the test
# split influence the transformation, then apply it to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Train a Logistic Regression model and print the model score
clf = LogisticRegression()
clf.fit(X_train_scaled, y_train)

# Score the fitted model on both the training and the held-out split
lr_train_score = clf.score(X_train_scaled, y_train)
lr_test_score = clf.score(X_test_scaled, y_test)
print(f'Training Score: {lr_train_score}')
print(f'Testing Score: {lr_test_score}')

Training Score: 0.9942908240473243
Testing Score: 0.9936545604622369


In [7]:
# Train a Random Forest Classifier (500 trees, fixed seed) and print the model score
clf = RandomForestClassifier(n_estimators = 500, random_state = 0)
clf.fit(X_train_scaled, y_train)

# Score the fitted forest on both the training and the held-out split
rf_train_score = clf.score(X_train_scaled, y_train)
rf_test_score = clf.score(X_test_scaled, y_test)
print(f'Training Score: {rf_train_score}')
print(f'Testing Score: {rf_test_score}')

Training Score: 0.9975409272252029
Testing Score: 0.9916941807676434


## Conclusion
Contrary to my prediction, logistic regression performed slightly better on the test set, but only by about 0.2 percentage points (99.37% vs. 99.17% testing score). It is likely that the random forest method would perform better after tweaking some of its parameters, but since both models score above 99% and logistic regression is much faster, it's unlikely that the tuning would be worth it in this case.