In [46]:
# import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

### Data Preparation

In [5]:
# import training and testing data
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

# preview the training data
train_df

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,7000.0,0.1894,256.38,MORTGAGE,75000.0,Not Verified,n,28.62,0.0,2.0,...,87.5,0.0,0.0,352260.0,62666.0,35000.0,10000.0,N,N,low_risk
1,40000.0,0.1614,975.71,MORTGAGE,102000.0,Source Verified,n,11.72,2.0,0.0,...,0.0,0.0,0.0,294664.0,109911.0,9000.0,71044.0,N,N,low_risk
2,11000.0,0.2055,294.81,RENT,45000.0,Verified,n,37.25,1.0,3.0,...,7.7,0.0,0.0,92228.0,36007.0,33000.0,46328.0,N,N,low_risk
3,4000.0,0.1612,140.87,MORTGAGE,38000.0,Not Verified,n,42.89,1.0,0.0,...,100.0,0.0,0.0,284273.0,52236.0,13500.0,52017.0,N,N,low_risk
4,14000.0,0.1797,505.93,MORTGAGE,43000.0,Source Verified,n,22.16,1.0,0.0,...,25.0,0.0,0.0,120280.0,88147.0,33300.0,78680.0,N,N,low_risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,19975.0,0.2565,801.09,RENT,28000.0,Not Verified,n,28.42,0.0,0.0,...,16.7,0.0,0.0,50055.0,28192.0,18700.0,19055.0,N,N,high_risk
12176,15000.0,0.1774,540.34,RENT,50000.0,Verified,n,23.43,4.0,0.0,...,11.1,0.0,0.0,70324.0,57025.0,13300.0,54824.0,N,N,high_risk
12177,3600.0,0.1862,131.28,RENT,60000.0,Not Verified,n,28.80,0.0,1.0,...,0.0,0.0,0.0,83765.0,55156.0,14800.0,53065.0,N,N,high_risk
12178,15000.0,0.0881,475.68,MORTGAGE,62000.0,Source Verified,n,11.44,0.0,0.0,...,0.0,0.0,0.0,189930.0,23748.0,7000.0,32930.0,N,N,high_risk


In [10]:
# preview the testing data
test_df

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,40000.0,0.1033,856.40,RENT,128700.0,Source Verified,n,12.47,0.0,1.0,...,57.1,0.0,0.0,63915.0,49510.0,49400.0,14515.0,Y,N,low_risk
1,24450.0,0.1430,572.72,MORTGAGE,44574.0,Not Verified,n,15.05,0.0,1.0,...,0.0,0.0,0.0,136425.0,19439.0,15500.0,18925.0,N,N,low_risk
2,13500.0,0.1430,316.23,OWN,60000.0,Not Verified,n,28.72,0.0,0.0,...,0.0,0.0,0.0,82124.0,65000.0,5400.0,61724.0,Y,N,low_risk
3,10625.0,0.1774,268.31,RENT,60000.0,Verified,n,15.70,0.0,4.0,...,20.0,0.0,0.0,54855.0,50335.0,23200.0,26255.0,N,N,low_risk
4,6375.0,0.1862,232.46,RENT,60000.0,Source Verified,n,35.50,0.0,0.0,...,75.0,0.0,0.0,90445.0,56541.0,15300.0,72345.0,N,N,low_risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,30000.0,0.1240,673.42,RENT,140480.0,Source Verified,n,15.74,0.0,0.0,...,28.6,0.0,0.0,159688.0,110873.0,48400.0,107388.0,N,N,high_risk
4698,24000.0,0.0756,747.22,RENT,50000.0,Not Verified,n,26.81,0.0,0.0,...,0.0,0.0,0.0,62375.0,18928.0,13300.0,30775.0,N,N,high_risk
4699,10000.0,0.2305,387.36,RENT,33000.0,Verified,n,38.51,0.0,2.0,...,0.0,0.0,0.0,43250.0,33022.0,8500.0,29550.0,N,N,high_risk
4700,8000.0,0.1862,205.86,RENT,38000.0,Source Verified,n,16.36,0.0,1.0,...,0.0,1.0,0.0,31357.0,19595.0,1500.0,9657.0,N,N,high_risk


In [13]:
# compare columns of testing and training data
listA = train_df.columns
listB = test_df.columns
len(list(set(listA) & set(listB)))

84

The testing and training data, at least in raw form, have the same number of columns as the intersection between their column-lists. Ergo, both have the same set of columns.

In [22]:
# Convert categorical data to numeric and separate target feature for training data
# first split training set into X and y
X_train = train_df.drop('target', axis=1)
y_train = train_df['target']

# then convert categorical X columns to numeric with pd.get_dummies()
X_train_d = pd.get_dummies(X_train)

# convert y categorical values to numeric, here being low-risk =>1 and high-risk =>0
y_train_n = LabelEncoder().fit_transform(y_train)

In [24]:
# Convert categorical data to numeric and separate target feature for testing data
# first split testing set into X and y
X_test = test_df.drop('target', axis=1)
y_test = test_df['target']

# then convert categorical X columns to numeric with pd.get_dummies()
X_test_d = pd.get_dummies(X_test)

# convert y categorical values to numeric, here being low-risk =>1 and high-risk =>0
y_test_n = LabelEncoder().fit_transform(y_test)

In [26]:
# check difference in number of columns for dummied data
listC = X_train_d.columns
listD = X_test_d.columns
print(f"number of columns in dummied training data: {len(listC)}")
print(f"number of columns in dummied testing data: {len(listD)}")
print(f"number of intersections between column lists: {len(list(set(listC) & set(listD)))}")

number of columns in dummied training data: 92
number of columns in dummied testing data: 91
number of intersections between column lists: 91


One column missing from testing data, have to add it in.

In [69]:
# determine which column is missing from the dummied testing data
missing_columns = set(listC).symmetric_difference(listD)
missing_columns

{'debt_settlement_flag_Y'}

In [70]:
# add any missing dummy variables to testing set
for col in missing_columns:
    X_test_d[col] = 0

# reorder the columns to match the training data to avoid any potential errors
cols = list(X_train_d.columns.values)
X_test_d = X_test_d[cols]

### Pre-Modeling Expectations 
I expect the RFC model to perform better than the logistic regression model with this dataset. From what I've been able to look up, RFC generally performs better on datasets with more categorical fields.
<br>
Since the datasets have multiple categorical flags, it is unlikely for there to be any kind of strong linear relationship for which a logistic regression might perform better.

### Unscaled Data - Model Generation and Testing

In [60]:
# Train the Logistic Regression model on the unscaled data and print the model score
# initialize the logistic regression model, high number of iterations needed to avoid nonconvergence error
log_unscaled = LogisticRegression(max_iter=7000)

# fit the model to the training data
log_unscaled.fit(X_train_d, y_train_n)

# check the model score
print(f"Training Data Score: {log_unscaled.score(X_train_d, y_train_n)}")
print(f"Testing Data Score: {log_unscaled.score(X_test_d, y_test_n)}")

Training Data Score: 0.702216748768473
Testing Data Score: 0.5655040408336878


In [61]:
# Train a Random Forest Classifier model and print the model score
# initialize the RFC model
rfc_unscaled = RandomForestClassifier(random_state=1, n_estimators=500)

# fit the model to the training data
rfc_unscaled.fit(X_train_d, y_train_n)

# check the model score
print(f"Training Data Score: {rfc_unscaled.score(X_train_d, y_train_n)}")
print(f"Testing Data Score: {rfc_unscaled.score(X_test_d, y_test_n)}")

Training Data Score: 1.0
Testing Data Score: 0.646958740961293


### Unscaled Data - Model Analysis
For the unscaled data, Random Forest Classification is clearly outperforming the Logistic Regression model, as expected.
<br>
While the RFC model's training score was a perfect 1, the logistic regression only managed just over .7 and while the scores for the testing data were closer, the RFC still beat the logistic by a large margin.
<br>
This fits with my expectation prior to running the models, most likely due to the large number of categorical values not fitting well with a linear model.
<br>
The relatively low testing data scores for both models are likely due to the large magnitude of the numeric values in the datasets.

### Scaled Data - Data Rescaling and Refitting Models

In [63]:
# create copies of X's
X_train_scaled = X_train_d.copy()
X_test_scaled = X_test_d.copy()

# Scale the non-categorical data: first copy X's and pull list of non-categorical columns
numCols = train_df.select_dtypes(include=np.number).columns.tolist()

# pull out numeric features to be scaled by calling columns named in numCols
unscaled_train_features = X_train_scaled[numCols]
unscaled_test_features = X_test_scaled[numCols]

# generate and fit the scaler to the numeric features in the training data
scaler = StandardScaler().fit(unscaled_train_features.values)

# transform the numeric features of each dataset
train_features = scaler.transform(unscaled_train_features.values)
test_features = scaler.transform(unscaled_test_features.values)

# apply the scaled values to the X's
X_train_scaled[numCols] = train_features
X_test_scaled[numCols] = test_features

In [64]:
# Train the Logistic Regression model on the scaled data and print the model score
# initialize the logistic regression model, high number of iterations needed to avoid nonconvergence error
log_scaled = LogisticRegression(max_iter=7000)

# fit the model to the training data
log_scaled.fit(X_train_scaled, y_train_n)

# check the model score
print(f"Training Data Score: {log_scaled.score(X_train_scaled, y_train_n)}")
print(f"Testing Data Score: {log_scaled.score(X_test_scaled, y_test_n)}")

Training Data Score: 0.7108374384236453
Testing Data Score: 0.7579753296469588


In [65]:
# Train a Random Forest Classifier model on the scaled data and print the model score
# initialize the RFC model
rfc_scaled = RandomForestClassifier(random_state=1, n_estimators=500)

# fit the model to the training data
rfc_scaled.fit(X_train_scaled, y_train_n)

# check the model score
print(f"Training Data Score: {rfc_scaled.score(X_train_scaled, y_train_n)}")
print(f"Testing Data Score: {rfc_scaled.score(X_test_scaled, y_test_n)}")

Training Data Score: 1.0
Testing Data Score: 0.6480221182475542


### Scaled Data - Model Analysis

After applying a standard scale to all numeric columns in the training and testing datasets, the Logistic Regression had significantly improved results while the Random Forest Classifier showed only a very slight improvement.
<br>
The logisitic model's testing score improved from .57 to .76, making for a far superior match to the experimental values recorded. On the other hand, scaling the data had virtually no impact on the results of the RFC, with the score going from .647 to .648.
<br>
This shows that my initial prediction for which model better fit the dataset was completely wrong. My error likely came from the preview I looked at at the beginning preferencing the categorical columns, giving me a false impression of their weight. Looking back, they make up only 8 out of 84 original columns.
<br>
Therefor, it makes sense that once errors from lack of scaling were removed, the Logistic Regression model performed better than the RFC.
<br> The lacking improvement for the RFC with scaled data, compared to the logistic improvements, should be down to the fact that, according to what I've read online, since RFC is not distance-based, scaling has very little impact.