In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [3]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [4]:
train_df

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.1240,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.1240,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,354912,19975.0,0.2565,801.09,RENT,28000.0,Not Verified,high_risk,n,...,100.0,16.7,0.0,0.0,50055.0,28192.0,18700.0,19055.0,N,N
12176,354944,354944,15000.0,0.1774,540.34,RENT,50000.0,Verified,high_risk,n,...,90.5,11.1,0.0,0.0,70324.0,57025.0,13300.0,54824.0,N,N
12177,354973,354973,3600.0,0.1862,131.28,RENT,60000.0,Not Verified,high_risk,n,...,100.0,0.0,0.0,0.0,83765.0,55156.0,14800.0,53065.0,N,N
12178,355002,355002,15000.0,0.0881,475.68,MORTGAGE,62000.0,Source Verified,high_risk,n,...,100.0,0.0,0.0,0.0,189930.0,23748.0,7000.0,32930.0,N,N


In [5]:
test_df

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.70,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.70,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.1240,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,77282,30000.0,0.1240,673.42,RENT,140480.0,Source Verified,high_risk,n,...,100.0,28.6,0.0,0.0,159688.0,110873.0,48400.0,107388.0,N,N
4698,77291,77291,24000.0,0.0756,747.22,RENT,50000.0,Not Verified,high_risk,n,...,100.0,0.0,0.0,0.0,62375.0,18928.0,13300.0,30775.0,N,N
4699,77292,77292,10000.0,0.2305,387.36,RENT,33000.0,Verified,high_risk,n,...,100.0,0.0,0.0,0.0,43250.0,33022.0,8500.0,29550.0,N,N
4700,77297,77297,8000.0,0.1862,205.86,RENT,38000.0,Source Verified,high_risk,n,...,95.0,0.0,1.0,0.0,31357.0,19595.0,1500.0,9657.0,N,N


In [6]:
set(train_df["loan_status"])

{'high_risk', 'low_risk'}

In [7]:
set(test_df["loan_status"])

{'high_risk', 'low_risk'}

In [12]:
# Convert categorical data to numeric and separate target label for training data
# use pd.get_dummies to convert categorial data to numeric
dummies_train_df = pd.get_dummies(train_df, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)

In [18]:
dummies_test_df = pd.get_dummies(test_df, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)


In [19]:
dummies_train_df.columns.shape


(96,)

In [20]:
dummies_test_df.columns.shape

(95,)

In [21]:
#see the difference between the two dummies df

set(dummies_train_df.columns) - set (dummies_test_df.columns)



{'debt_settlement_flag_Y'}

In [22]:
#drop {'debt_settlement_flag_Y'} from dummies_train_df.columns

dummies_train_df = dummies_train_df.drop(columns = ['debt_settlement_flag_Y'])

In [23]:
train_df.columns

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'home_ownership', 'annual_inc', 'verification_status', 'loan_status',
       'pymnt_plan', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'policy_code', 'application_type',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_ol

In [24]:
train_df.columns.size

86

In [25]:
#alternative methods for conversion (alternate method)
#changing strings to category type
train_df[['home_ownership','verification_status']] = train_df[['home_ownership','verification_status']].astype("category")

In [26]:
#then, change category to numerical values (alternate method)
train_df['home_ownership'] = train_df['home_ownership'].astype("category").cat.codes
train_df

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,1,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,1,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.1240,448.95,1,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.1240,100.22,3,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,1,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,354912,19975.0,0.2565,801.09,3,28000.0,Not Verified,high_risk,n,...,100.0,16.7,0.0,0.0,50055.0,28192.0,18700.0,19055.0,N,N
12176,354944,354944,15000.0,0.1774,540.34,3,50000.0,Verified,high_risk,n,...,90.5,11.1,0.0,0.0,70324.0,57025.0,13300.0,54824.0,N,N
12177,354973,354973,3600.0,0.1862,131.28,3,60000.0,Not Verified,high_risk,n,...,100.0,0.0,0.0,0.0,83765.0,55156.0,14800.0,53065.0,N,N
12178,355002,355002,15000.0,0.0881,475.68,1,62000.0,Source Verified,high_risk,n,...,100.0,0.0,0.0,0.0,189930.0,23748.0,7000.0,32930.0,N,N


In [27]:
# SEPARATE TARGET LABEL for TRAINING DATA
# X are the factors, Y is the prediction

X_train = dummies_train_df.drop(columns = ["loan_status_low_risk", "loan_status_high_risk"])
y_train = dummies_train_df["loan_status_high_risk"]



In [117]:
y_train

0        0
1        0
2        0
3        0
4        0
        ..
12175    1
12176    1
12177    1
12178    1
12179    1
Name: loan_status_high_risk, Length: 12180, dtype: uint8

In [118]:
# X_train["loan_status_high_risk"]

In [119]:
# Convert categorical data to numeric and separate target label for testing data
dummies_test_df = pd.get_dummies(test_df, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)
dummies_test_df

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,loan_status_high_risk,loan_status_low_risk,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,67991,67991,40000.0,0.0819,814.70,140000.0,19.75,0.0,1.0,18.0,...,0,1,1,0,1,1,0,1,0,1
1,25429,25429,6000.0,0.1524,208.70,55000.0,11.52,2.0,0.0,8.0,...,0,1,1,0,1,1,0,1,0,1
2,38496,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,...,0,1,1,0,1,1,0,1,0,1
3,19667,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,...,0,1,1,0,1,1,0,1,0,1
4,37505,37505,3600.0,0.1240,120.27,50000.0,16.08,0.0,3.0,6.0,...,0,1,1,0,1,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,77282,30000.0,0.1240,673.42,140480.0,15.74,0.0,0.0,20.0,...,1,0,1,1,0,1,0,1,0,1
4698,77291,77291,24000.0,0.0756,747.22,50000.0,26.81,0.0,0.0,9.0,...,1,0,1,0,1,1,0,1,0,1
4699,77292,77292,10000.0,0.2305,387.36,33000.0,38.51,0.0,2.0,7.0,...,1,0,1,1,0,1,0,1,0,1
4700,77297,77297,8000.0,0.1862,205.86,38000.0,16.36,0.0,1.0,8.0,...,1,0,1,0,1,1,0,1,0,1


In [129]:
# SEPARATE TARGET LABEL for TESTING DATA
# X are the factors, Y is the prediction

X_test = dummies_test_df.drop(columns = ["loan_status_low_risk","loan_status_high_risk"])
y_test = dummies_test_df["loan_status_low_risk"]


In [130]:
#Random Forest Model

# X is the input values, y is the output values
model = RandomForestClassifier()
model.fit(X_train,y_train)

#insert the X_test values to get a prediction
predictions = model.predict(X_test)

#to see how accurate the model is, we simply compare the "predictions for output test values", with the "y_test values".
score = accuracy_score(y_test, predictions)
score

0.3913228413441089

In [131]:
#Logistic Regression Classifier

# X is the input values, y is the output values
model = LogisticRegression()
model.fit(X_train,y_train)

#insert the X_test values to get a prediction
predictions = model.predict(X_test)

#to see how accurate the model is, we simply compare the "predictions for output test values", with the "y_test values".
score = accuracy_score(y_test, predictions)
score



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.4746916205869843

In [132]:
# standardize the features with a scaler

scaler = StandardScaler()
scaler.fit(X_train)

# transform the data, fit it into the new space

X_train_scaled = scaler.transform(X_train)
X_train_scaled.shape




(12180, 93)

In [133]:
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[-1.20255948, -1.20255948,  2.20755943, ...,  0.17149859,
        -0.17149859,  0.02026518],
       [-1.62943343, -1.62943343, -1.11348584, ...,  0.17149859,
        -0.17149859,  0.02026518],
       [-1.49837845, -1.49837845, -1.34791257, ...,  0.17149859,
        -0.17149859,  0.02026518],
       ...,
       [-1.10927546, -1.10927546, -0.72277464, ...,  0.17149859,
        -0.17149859,  0.02026518],
       [-1.10922531, -1.10922531, -0.91813024, ...,  0.17149859,
        -0.17149859,  0.02026518],
       [-1.1091551 , -1.1091551 ,  1.23078141, ...,  0.17149859,
        -0.17149859,  0.02026518]])

In [134]:
#Random Forest Model (scaled)

# X is the input values, y is the output values
model = RandomForestClassifier()
model.fit(X_train_scaled,y_train)

#insert the X_test values to get a prediction
predictions = model.predict(X_test_scaled)

#to see how accurate the model is, we simply compare the "predictions for output test values", with the "y_test values".
score = accuracy_score(y_test, predictions)
score

0.41280306252658444

In [135]:
#Logistic Regression Classifier (Scaled)

# X is the input values, y is the output values
model = LogisticRegression()
model.fit(X_train_scaled,y_train)

#insert the X_test values to get a prediction
predictions = model.predict(X_test_scaled)

#to see how accurate the model is, we simply compare the "predictions for output test values", with the "y_test values".
score = accuracy_score(y_test, predictions)
score


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.2781794980859209

In [None]:
# add missing dummy variables to testing set

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score

In [None]:
# Train a Random Forest Classifier model and print the model score

In [None]:
# Scale the data

In [None]:
# Train the Logistic Regression model on the scaled data and print the model score

In [None]:
# Train a Random Forest Classifier model on the scaled data and print the model score