In [29]:
import numpy as np
import pandas as pd
from pathlib import Path

In [30]:
# Read the two loan datasets: the 2019 file is used for training and the
# 2020 Q1 file for testing.
train_path = Path('Resources/2019loans.csv')
test_path = Path('Resources/2020Q1loans.csv')

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [31]:
# Prepare the training data: encode the target as 0/1, split it from the
# features, and one-hot encode the categorical feature columns.
loan_risk_dict = {'low_risk': 0, 'high_risk': 1}
train_df['loan_status'] = train_df['loan_status'].replace(loan_risk_dict)

y_train = train_df['loan_status']

# drop_first=True removes one dummy level per categorical feature; the
# 'Unnamed: 0' column is discarded after encoding.
# NOTE(review): the displayed test features still contain an `index` column,
# which looks like a row identifier rather than a predictor - confirm whether
# it should be dropped as well.
X_train = (
    pd.get_dummies(train_df.drop(columns='loan_status'), drop_first=True)
    .drop(columns='Unnamed: 0')
)

# Remember the training feature layout so the test set can be aligned to it.
columns = X_train.columns.tolist()

In [32]:
# Convert categorical data to numeric and separate target feature for testing data
test_df = test_df.replace({'loan_status':loan_risk_dict})
X_test = test_df.drop('loan_status', axis=1)
y_test = test_df['loan_status']

# Encode every category level here (no drop_first). Which level drop_first
# removes depends on the categories present in *this* frame, so applying it
# separately to the test set can drop a different level than was dropped for
# the training set and silently zero out a real category once the frame is
# reindexed. The reindex against the training `columns` that follows keeps
# exactly the dummy columns the model was trained on and discards the rest.
X_test = pd.get_dummies(X_test)
X_test = X_test.drop('Unnamed: 0', axis=1)

In [33]:
# Align the test features with the training feature layout: reindexing keeps
# only the training columns (in training order) and creates any column the
# test set lacks as NaN; fillna(0) then zero-fills those columns, along with
# any other missing values in the frame.
X_test = X_test.reindex(columns=columns).fillna(0)
X_test.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,total_il_high_credit_limit,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,initial_list_status_w,application_type_Joint App,hardship_flag_Y,debt_settlement_flag_Y
0,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,...,99475.0,1,0,0,0,0,1,0,0,0.0
1,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,...,23628.0,0,0,1,0,0,1,0,0,0.0
2,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,...,15000.0,0,0,1,0,0,1,0,0,0.0
3,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,...,35981.0,0,0,1,0,0,1,0,0,0.0
4,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,...,24977.0,0,0,1,0,0,1,0,0,0.0


#### Prediction: ####
I think that logistic regression will perform better than a random forest classifier because random forests usually don't do well with unscaled data and a large number of features.


In [43]:
# Fit a logistic regression on the raw (unscaled) features and report its
# accuracy on the training and testing sets.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# A generous iteration cap gives the solver room to converge on unscaled features.
classifier = LogisticRegression(max_iter=50000).fit(X_train, y_train)

print(f"Training dataset score: {classifier.score(X_train, y_train)}")
print(f"Testing dataset score: {classifier.score(X_test, y_test)}")


Training dataset score: 0.7003284072249589
Testing dataset score: 0.5644406635474266


In [44]:
# Fit a random forest on the raw (unscaled) features and report its accuracy
# on the training and testing sets.
from sklearn.ensemble import RandomForestClassifier

# random_state is fixed so the forest is reproducible between runs.
clf = RandomForestClassifier(random_state=1, n_estimators=250).fit(X_train, y_train)

print(f"Training dataset score: {clf.score(X_train, y_train)}")
print(f"Testing dataset score: {clf.score(X_test, y_test)}")

Training dataset score: 1.0
Testing dataset score: 0.6682262866865164


In [45]:
# Standardize the features. The scaler learns its statistics from the
# training set only and the same transformation is then applied to the
# testing set.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


#### Prediction: ####
I think that with the data being scaled, both models will improve their performance.

In [48]:
# Refit a fresh logistic regression, this time on the standardized features,
# and report its accuracy on the training and testing sets.
classifier = LogisticRegression(max_iter=50000).fit(X_train_scaled, y_train)

print(f"Training dataset score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing dataset score: {classifier.score(X_test_scaled, y_test)}")

Training dataset score: 0.712807881773399
Testing dataset score: 0.7203317737133135


In [49]:
# Refit a fresh random forest (same settings and seed as before) on the
# standardized features and report its accuracy on both sets.
clf = RandomForestClassifier(random_state=1, n_estimators=250).fit(X_train_scaled, y_train)

print(f"Training dataset score: {clf.score(X_train_scaled, y_train)}")
print(f"Testing dataset score: {clf.score(X_test_scaled, y_test)}")


Training dataset score: 1.0
Testing dataset score: 0.6692896639727776


### Conclusion: ###

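Working with scaled data, logistic regression performed better than before: its testing score rose from about 0.56 to about 0.72. The random forest, on the other hand, performed essentially the same (a testing score of about 0.67 both times), which makes sense because tree-based models split on feature thresholds and are largely insensitive to feature scaling. Logistic regression's improvement fits my prediction, but I was wrong about the random forest improving.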