**Thinkful - 4.3.6 - Challenge - Make Your Network**

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import pydotplus
from sklearn import tree
from IPython.display import Image
from sklearn import ensemble
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
%matplotlib inline

# Replace the path with the correct path for your data.
y2015 = pd.read_csv('data/LoanStats3d.csv',skipinitialspace=True,header=1)

# Remove two summary rows at the end that don't actually contain data.
y2015 = y2015[:-2]

# Drop other columns with many unique variables
y2015['id'] = pd.to_numeric(y2015['id'], errors='coerce')
y2015['int_rate'] = pd.to_numeric(y2015['int_rate'].str.strip('%'), errors='coerce')
y2015.drop(['url', 'emp_title', 'zip_code', 'earliest_cr_line', 'revol_util',
            'sub_grade', 'addr_state', 'desc', 'verification_status_joint',
           'policy_code'], 1,inplace=True)

# Create dataset for use in models
X = y2015.drop(['loan_status'], 1)
y = y2015['loan_status']
X = pd.get_dummies(X)
X = X.dropna(axis=1)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
y.value_counts()

Current               287414
Fully Paid             87989
Charged Off            29178
Late (31-120 days)      9510
In Grace Period         4320
Late (16-30 days)       1888
Default                  796
Name: loan_status, dtype: int64

In [11]:
y2015.shape

(421095, 101)

# Decision Tree

In [8]:
# Initialize and train our tree.
status_options = ['Current','Fully Paid','Charged Off','Late (31-120 days)','In Grace Period',
                 'Late (16-30 days)','Default']

start_time = time.time()

# Build Model
decision_tree = tree.DecisionTreeClassifier(criterion='entropy',max_features=9,max_depth=6)
dtree = decision_tree.fit(X,y)
model_time = time.time()
a = (model_time - start_time)
print("Model Time: %s seconds" % a)

#Evaluate Model
#X_new = rfc_kbest.transform(X)
score = cross_val_score(dtree, X, y, cv=5)
print(score)
print("Weighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

cross_val_time = time.time()
b = (cross_val_time - model_time)
print("Cross Val Time: %s seconds" % b)

Model Time: 1.5377931594848633 seconds
[ 0.90376509  0.90511755  0.93944431  0.92501603  0.92091764]
Weighted Accuracy: 0.92 (+/- 0.03)
Cross Val Time: 9.442087888717651 seconds


In [9]:
X = X.drop('pymnt_plan_n',1)

# Random Forest

**All Estimators**

In [12]:
start_time = time.time()

# Build Model
rfc = ensemble.RandomForestClassifier()
kbest = SelectKBest(k=3).fit(X,y)
model_time = time.time()
a = (model_time - start_time)
print("Model Time: %s seconds" % a)

#Evaluate Model
X_new = kbest.transform(X)
score = cross_val_score(rfc, X_new, y, cv=5)
cross_val_time = time.time()
b = (cross_val_time - model_time)
print("Cross Val Time: %s seconds" % b)
print(score)
print("Weighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

Model Time: 15.672481060028076 seconds
Cross Val Time: 157.0256631374359 seconds
[ 0.91094963  0.91109211  0.91149581  0.93236921  0.9080266   0.91747803
  0.92246313  0.91118288  0.90811504  0.91269653]
Weighted Accuracy: 0.91 (+/- 0.01)


**20 Estimators**

In [5]:
start_time = time.time()

# Build Model
rfc = ensemble.RandomForestClassifier(n_estimators=20)
kbest = SelectKBest(k=3).fit(X,y)
model_time = time.time()
a = (model_time - start_time)
print("Model Time: %s seconds" % a)

#Evaluate Model
X_new = kbest.transform(X)
score = cross_val_score(rfc, X_new, y, cv=5)
cross_val_time = time.time()
b = (cross_val_time - model_time)
print("Cross Val Time: %s seconds" % b)
print(score)
print("Weighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

Model Time: 19.266163110733032 seconds
Cross Val Time: 550.0227119922638 seconds
[ 0.91111586  0.91104462  0.91092589  0.93296288  0.90790786  0.91707433
  0.92272436  0.91177658  0.90711758  0.91141405]
Weighted Accuracy: 0.91 (+/- 0.02)


**5 Estimators**

In [6]:
start_time = time.time()

# Build Model
rfc = ensemble.RandomForestClassifier(n_estimators=5)
kbest = SelectKBest(k=3).fit(X,y)
model_time = time.time()
a = (model_time - start_time)
print("Model Time: %s seconds" % a)

#Evaluate Model
X_new = kbest.transform(X)
score = cross_val_score(rfc, X_new, y, cv=5)
cross_val_time = time.time()
b = (cross_val_time - model_time)
print("Cross Val Time: %s seconds" % b)
print(score)
print("Weighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))

Model Time: 41.46573996543884 seconds
Cross Val Time: 128.81869220733643 seconds
[ 0.91130583  0.90961981  0.91075966  0.93263043  0.90717169  0.91593446
  0.92251063  0.91108789  0.90697509  0.9112478 ]
Weighted Accuracy: 0.91 (+/- 0.02)


# Multi-Layer Perceptron

**One Layer, 1000 layers wide**

In [16]:
start_time = time.time()

# Establish and fit the model
mlp = MLPClassifier(hidden_layer_sizes=(200,))
mlp.fit(X, y)
model_time = time.time()
score1 = mlp.score(X, y)
print("Score = ",score1)
a = (model_time - start_time)
print("Model Time: %s seconds" % a)

score2 = cross_val_score(mlp, X, y, cv=5)
cross_val_time = time.time()
b = (cross_val_time - model_time)
print("Cross Val Time: %s seconds" % b)
print(score2)
print("Weighted Accuracy: %0.2f (+/- %0.2f)" % (score2.mean(), score2.std() * 2))

Score =  0.682955152638
Model Time: 256.7434058189392 seconds
Cross Val Time: 725.9790360927582 seconds
[ 0.68273946  0.24042983  0.88420803  0.6833931   0.68511922]
Weighted Accuracy: 0.64 (+/- 0.42)


In [17]:
start_time = time.time()

# Establish and fit the model
mlp = MLPClassifier(hidden_layer_sizes=(50,20))
mlp.fit(X, y)
model_time = time.time()
score1 = mlp.score(X, y)
print("Score = ",score1)
a = (model_time - start_time)
print("Model Time: %s seconds" % a)

score2 = cross_val_score(mlp, X, y, cv=5)
cross_val_time = time.time()
b = (cross_val_time - model_time)
print("Cross Val Time: %s seconds" % b)
print(score2)
print("Weighted Accuracy: %0.2f (+/- %0.2f)" % (score2.mean(), score2.std() * 2))

Score =  0.682544318978
Model Time: 52.61250305175781 seconds
Cross Val Time: 334.90957593917847 seconds
[ 0.68260885  0.68259321  0.68261696  0.68260942  0.68256626]
Weighted Accuracy: 0.68 (+/- 0.00)


In [18]:
start_time = time.time()

# Establish and fit the model
mlp = MLPClassifier(hidden_layer_sizes=(20,10,5))
mlp.fit(X, y)
model_time = time.time()
score1 = mlp.score(X, y)
print("Score = ",score1)
a = (model_time - start_time)
print("Model Time: %s seconds" % a)

score2 = cross_val_score(mlp, X, y, cv=5)
cross_val_time = time.time()
b = (cross_val_time - model_time)
print("Cross Val Time: %s seconds" % b)
print(score2)
print("Weighted Accuracy: %0.2f (+/- %0.2f)" % (score2.mean(), score2.std() * 2))

Score =  0.682539569456
Model Time: 23.75485396385193 seconds
Cross Val Time: 99.53593301773071 seconds
[ 0.68252574  0.68253384  0.68267632  0.6825263   0.68254251]
Weighted Accuracy: 0.68 (+/- 0.00)


# Conclusions

The random forest models ran faster than the MLP models, except in the case where there were fewer neurons in the MLP layers. The random forest models were also more accurate overall and had more consistent cross validation scores.