# License 
***
Copyright (C) 2017 -- 2022 J. Patrick Hall, jphall@gwu.edu

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

## Installs and Imports

1. Standard Python imports

In [None]:
import pandas as pd # import pandas for easy data manipulation using data frames

from matplotlib import pyplot as plt # plotting
import numpy as np                   # basic array and matrix handling

from sklearn import tree                                  # sklearn tree model for decision trees
from sklearn.model_selection import train_test_split      # for partitioning data
from sklearn.model_selection import cross_val_score       # for cross validation
from sklearn.metrics import roc_auc_score, accuracy_score # to assess decision tree performance

# to upload local files (the google.colab module is only available inside Google Colab)
import io
from google.colab import files  

SEED = 12345 # seed for better reproducibility; intended for the random_state arguments used later

## Load example data

2. Upload class example data

In [None]:
# special Google Colab command to upload a file from the local computer
# the result is indexed by file name later on, and the stored contents are decoded from bytes
uploaded = files.upload() # REQUIRES STUDENT INPUT

In [None]:
# 3. Inspect the uploaded object: its keys are what the next step uses to look up the file contents
uploaded.keys() # what is stored in that Python object?

4. Convert to Pandas DataFrame

In [None]:
# convert the uploaded file into a Pandas DataFrame, one step at a time
# the name in quotes must match the key shown by uploaded.keys() directly above
csv_bytes = uploaded['loan_clean.csv']    # raw contents of the uploaded file
csv_text = csv_bytes.decode('utf-8')      # bytes -> text
raw = pd.read_csv(io.StringIO(csv_text))  # parse the text as if it were a CSV file

## Split training data into three partitions for improved model selection

5. Set modeling roles based on feature names

In [None]:
# input features whose names start with GRP_
grp_features = [
    'GRP_REP_home_ownership',
    'GRP_addr_state',
    'GRP_purpose',
    'GRP_verification_status',
]

# input features whose names start with STD_IMP_REP_
std_imp_rep_features = [
    'STD_IMP_REP_annual_inc',
    'STD_IMP_REP_delinq_2yrs',
    'STD_IMP_REP_dti',
    'STD_IMP_REP_emp_length',
    'STD_IMP_REP_int_rate',
    'STD_IMP_REP_loan_amnt',
    'STD_IMP_REP_longest_credit_lengt',
    'STD_IMP_REP_revol_util',
    'STD_IMP_REP_term_length',
    'STD_IMP_REP_total_acc',
]

# modeling roles: x_names are the model inputs, y_name is the target to predict
x_names = grp_features + std_imp_rep_features
y_name = 'bad_loan'

6. Create train, valid, and test partitions for honest assessment

In [None]:
# ALWAYS set a random seed when working with randomness
# SEED (defined alongside the imports) is passed as random_state to both
# splits so that the same rows land in the same partitions on every run

# first split: half of the rows become training data, the other half is held out
train_X, valid_test_X, train_y, valid_test_y = train_test_split(
    raw[x_names], raw[y_name], test_size=0.5, random_state=SEED)

# second split: the held-out half is divided evenly into validation and test
valid_X, test_X, valid_y, test_y = train_test_split(
    valid_test_X, valid_test_y, test_size=0.5, random_state=SEED)

# summarize (the + 1 counts the target column in addition to the input columns)
for label, part_X in (('Training', train_X), ('Validation', valid_X), ('Testing', test_X)):
    print('%s data: %i rows and %i columns' % (label, part_X.shape[0], part_X.shape[1] + 1))

# test that splits sum to original data size
assert raw.shape[0] == train_X.shape[0] + valid_X.shape[0] + test_X.shape[0]
assert raw.shape[0] == train_y.shape[0] + valid_y.shape[0] + test_y.shape[0]

7. Train decision tree models to predict loan default and compare them using validation data

In [None]:
# train one decision tree per depth from 1 to max_depth and
# record training and validation AUC for each, so the depth can be
# chosen with validation-based early stopping
max_depth = 12
candidate_models = {} # maps tree depth -> dict of fitted model and its metrics

for depth in range(1, max_depth + 1):

    # fit tree at the current depth
    # SEED (defined alongside the imports) keeps the fitted tree reproducible
    clf = tree.DecisionTreeClassifier(max_depth=depth, random_state=SEED)
    clf.fit(train_X, train_y)

    # calculate AUC
    # column 1 of predict_proba is the probability of the second class
    # (presumably bad_loan = 1 -- confirm against clf.classes_)
    train_phat = clf.predict_proba(train_X)[:, 1] # score current tree on training data
    valid_phat = clf.predict_proba(valid_X)[:, 1] # score current tree on validation data
    train_auc = roc_auc_score(train_y, train_phat) # train AUC
    valid_auc = roc_auc_score(valid_y, valid_phat) # validation AUC

    # calculate cross-validation metrics to assess stability:
    # the standard deviation of AUC across 5 folds of the validation data
    cv_scores = cross_val_score(clf, valid_X, valid_y, scoring='roc_auc', cv=5)
    cv_std = np.std(cv_scores)

    # store information about current tree, keyed by its depth
    candidate_models[depth] = {
        'Model': clf,
        'Training AUC': train_auc,
        'Validation AUC': valid_auc,
        '5-Fold SD': cv_std,
    }


8. Generate iteration plot based on AUC

In [None]:
# tabulate the candidate trees: one row per tree depth,
# one column per item stored for that tree
candidate_results = pd.DataFrame.from_dict(candidate_models, orient='index')

# iteration plot: training and validation AUC against tree depth,
# drawn with simple pandas plotting on a matplotlib axis
auc_columns = ['Training AUC', 'Validation AUC']
fig, ax = plt.subplots(figsize=(8, 8))
_ = candidate_results[auc_columns].plot(title='Iteration Plot', ax=ax)
_ = ax.set_xlabel('Tree Depth')
_ = ax.set_ylabel('AUC')

# print the numeric results, leaving out the first column
# (the fitted model objects, which do not print usefully)
print(candidate_results.iloc[:, 1:])

9. Select the best model based on the iteration plot

In [None]:
# select the depth whose tree achieved the highest validation AUC
# to override this choice after reading the iteration plot (for example, to
# prefer a shallower tree with nearly the same AUC), set best_depth to any
# other key of candidate_models
best_depth = max(candidate_models, key=lambda d: candidate_models[d]['Validation AUC'])
best_model = candidate_models[best_depth]['Model']

10. Visualize entire tree

In [None]:
# draw every node of the selected tree for human interpretation
# the canvas is very wide, presumably so that the nodes of a deep tree stay readable
tree_canvas_size = (400, 70) # (width, height) in inches
tree_class_labels = ['Good Loan', 'Bad Loan'] # presumably bad_loan = 0, then bad_loan = 1 -- confirm

fig = plt.figure(figsize=tree_canvas_size)
_ = tree.plot_tree(
    best_model,
    feature_names=x_names,
    class_names=tree_class_labels,
    filled=True,
)

11. Plot feature importance

In [None]:
# variable importance chart
# step 1: put the selected tree's feature importances into a one-column
#         data frame, labelled by feature name
importances = pd.DataFrame({'Importance': best_model.feature_importances_},
                           index=x_names)

# step 2: sort ascending and draw horizontal bars
ranked_importances = importances.sort_values(by='Importance')
_ = ranked_importances.plot.barh(title='Variable Importance', legend=False)

12. Use test performance to estimate real-world performance

In [None]:
# final HONEST assessment: score the selected model on the test partition
# the point is not to see how well the model *did*,
# but to estimate how it *will do* on unseen data
test_phat = best_model.predict_proba(test_X)[:, 1]
test_auc = roc_auc_score(test_y, test_phat)
print('Test AUC: %.4f' % test_auc)

13. Generate a prediction for a new customer

In [None]:
# predict on new data -- REQUIRES STUDENT INPUT
# describe the new customer by placing one value inside each pair of brackets;
# the keys are the model's input features, in the same order as x_names
new_row = pd.DataFrame({
    "GRP_REP_home_ownership": [],
    "GRP_addr_state": [],
    "GRP_purpose": [],
    "GRP_verification_status": [],
    "STD_IMP_REP_annual_inc": [],
    "STD_IMP_REP_delinq_2yrs": [],
    "STD_IMP_REP_dti": [],
    "STD_IMP_REP_emp_length": [],
    "STD_IMP_REP_int_rate": [],
    "STD_IMP_REP_loan_amnt": [],
    "STD_IMP_REP_longest_credit_lengt": [],
    "STD_IMP_REP_revol_util": [],
    "STD_IMP_REP_term_length": [],
    "STD_IMP_REP_total_acc": [],
})

# generate prediction -- REQUIRES STUDENT INPUT
