## Load packages

In [39]:
# built-in libraries
from datetime import date
import os
# third-party libraries
import pandas as pd
# local application
from auto_ml_validation.validation_package.process_data import split_x_y, process_data
from auto_ml_validation.validation_package.benchmark_pipeline import auto_benchmark, save_benchmark_output
from auto_ml_validation.validation_package.train_pipeline import train
from auto_ml_validation.validation_package.evaluation.statistical_metrics_evaluator import StatisticalMetricsEvaluator
from auto_ml_validation.validation_package.evaluation.performance_metrics_evaluator import evaluate_performance

ImportError: cannot import name 'evaluate_performance' from 'auto_ml_validation.validation_package.evaluation.performance_metrics_evaluator' (/Users/keira/Desktop/auto-ml-validation/auto_ml_validation/validation_package/evaluation/performance_metrics_evaluator.py)

## Benchmarking pipeline

### Benchmarking setup

Load raw datasets & specify target + categorical columns

In [2]:
train_raw = pd.read_csv('./data/stage_2/loanstats_train.csv')
val_raw = pd.read_csv('./data/stage_2/loanstats_val.csv')
test_raw = pd.read_csv('./data/stage_2/loanstats_test.csv')
target = 'loan_status'
cat_cols = ["home_ownership", "verification_status",
            "hardship_flag", "debt_settlement_flag",
            "initial_list_status", "application_type"]

In [3]:
print(f'Training size: {train_raw.shape}, Validation size: {val_raw.shape}, Testing size: {test_raw.shape}')

Training size: (58147, 76), Validation size: (7268, 76), Testing size: (7269, 76)


Downsampling training data (Note: optional; doing so here only because our raw training data is too large)

In [4]:
neg_df = train_raw[train_raw['loan_status'] == 0]
pos_df = train_raw[train_raw['loan_status'] == 1]
print(neg_df.shape, pos_df.shape)
neg_samp = neg_df.sample(n=15000, random_state=42, ignore_index=True)
print(neg_samp.shape)
train_sm = pd.concat([neg_samp, pos_df], axis=0, ignore_index=True)
# shuffle
train_sm = train_sm.sample(frac=1, random_state=1)
print(train_sm.shape)

(57843, 76) (304, 76)
(15000, 76)
(15304, 76)


In [5]:
train_sm.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,dti,delinq_2yrs,inq_last_6mths,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
9965,6000,0.1171,198.46,RENT,55000.0,Not Verified,0,19.09,0,0,...,93.8,0.0,0,0,54184,34885,23400,29984,N,N
1873,20000,0.0702,617.73,RENT,81000.0,Not Verified,0,16.22,0,0,...,100.0,16.7,0,0,74334,41232,34600,36734,N,N
3737,28000,0.0881,887.92,MORTGAGE,46950.0,Source Verified,0,58.03,0,0,...,92.0,50.0,1,0,338397,38976,19100,71760,N,N
11359,2500,0.0819,78.57,MORTGAGE,85000.0,Not Verified,0,21.84,0,1,...,88.2,0.0,0,0,369846,46946,15000,71946,N,N
4330,8000,0.1131,263.09,MORTGAGE,110000.0,Source Verified,0,11.82,0,1,...,75.0,25.0,0,0,385109,71647,14700,70709,N,N


In [6]:
print(f'Training size: {train_sm.shape}, Validation size: {val_raw.shape}, Testing size: {test_raw.shape}')

Training size: (15304, 76), Validation size: (7268, 76), Testing size: (7269, 76)


Specify metric for comparing benchmark models: Choose among accuracy, f1, precision, recall, and roc_auc

In [7]:
benchmark_metric = 'f1'

### Data processing

In [8]:
other_dfs = [val_raw, test_raw]
X_train, y_train, others = process_data(train_sm, other_dfs, target, cat_cols)
X_val, y_val = others[0]
X_test, y_test = others[1]

In [9]:
X_train.head()

Unnamed: 0,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,hardship_flag_N,debt_settlement_flag_N,initial_list_status_f,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,-0.146349,-0.116543,-0.162912,-0.930299,-0.375333,0.0,-0.807781,-0.464104,-0.226617,-0.511226
1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,-0.146349,0.413808,0.600788,-0.438634,-0.375333,0.0,-0.702159,-0.349318,0.191562,-0.3724
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,-0.146349,-0.646893,-0.384632,0.541752,2.605518,0.0,0.682001,-0.390118,-0.387168,0.347972
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,-0.146349,-0.646893,-0.852706,-0.930299,-0.375333,0.0,0.84685,-0.24598,-0.540251,0.351798
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,-0.146349,0.413808,-2.478649,-0.194274,-0.375333,0.0,0.926855,0.20074,-0.551453,0.326357


In [10]:
y_train.head()

9965     0
1873     0
3737     0
11359    0
4330     0
Name: loan_status, dtype: int64

In [11]:
print(f'Training size: {X_train.shape}, Validation size: {X_val.shape}, Testing size: {X_test.shape}')

Training size: (15304, 82), Validation size: (7268, 82), Testing size: (7269, 82)


### Auto benchmarking

Demonstrate checking input validity 

In [11]:
best_clf_name, benchmark_output = auto_benchmark(X_train, y_train, X_val, y_val, 'acc', feature_selection=False, n_jobs=-1, mode='parallel', verbose=True)

ValueError: Invalid metric: acc. Please choose from {'accuracy', 'roc_auc', 'precision', 'recall', 'f1'}.

In [12]:
best_clf_name, benchmark_output = auto_benchmark(X_train, y_train, X_val, y_val, benchmark_metric, feature_selection=False, n_jobs=-1, mode='??', verbose=True)

ValueError: Invalid mode: ??. Please choose from "sequential" or "parallel".

Perform benchmarking (Note: feature selection is skipped here to save time)

In [15]:
benchmark_model, benchmark_output = auto_benchmark(X_train, y_train, X_val, y_val, benchmark_metric, feature_selection=False, n_jobs=-1, mode='parallel', verbose=True)

Number of classifiers:  4
Training Decision Tree Classifier.
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Training Logistic Classifier.
Training XGBoost Classifier.
Training Random Forest Classifier.
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best hyperparameters: {'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 8, 'criterion': 'gini'}; best score: 0.5424639005141778.
Best hyperparameters: {'C': 0.017073967431528118}; best score: 0.09724640847192566.
Best hyperparameters: {'colsample_bytree': 0.7296244459829335, 'gamma': 0.16685430556951092, 'learning_rate': 0.008952946643416624, 'max_depth': 8, 'min_child_weight': 1.185260448662222, 'n_estimators': 100, 'subsample': 0.8609993861334124}; best score: 0.5652441511511976.
Best hyperparameters: {'n_estimators': 300, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_depth': 16,

Check benchmark model

In [16]:
benchmark_model

<auto_ml_validation.validation_package.algorithms.xgboost.XGBoostClassifier at 0x172c9f9d0>

Save benchmarking output

In [17]:
models_dir = f'./outputs/loanstats_{date.today()}'
result_path = f'./outputs/loanstats_{date.today()}/benchmark_output.json'
os.mkdir(models_dir)
save_benchmark_output(benchmark_output, models_dir, result_path)

Successfully saved model to ./outputs/loanstats_2023-03-21/Decision Tree Classifier.pkl.
Successfully saved model to ./outputs/loanstats_2023-03-21/Logistic Classifier.pkl.
Successfully saved model to ./outputs/loanstats_2023-03-21/Random Forest Classifier.pkl.
Successfully saved model to ./outputs/loanstats_2023-03-21/XGBoost Classifier.pkl.




## Replicate the model to be validated

### Training setup

Load processed datasets & split features & target

In [18]:
train_p = pd.read_csv('./data/stage_2/loanstats_train_sm_processed.csv')
val_p = pd.read_csv('./data/stage_2/loanstats_val_processed.csv')
test_p = pd.read_csv('./data/stage_2/loanstats_test_processed.csv')
train_p.head()

Unnamed: 0,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,hardship_flag_N,debt_settlement_flag_N,initial_list_status_f,...,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,loan_status
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,-0.119909,-0.143239,-0.925559,-0.373194,0.0,-0.806673,-0.465307,-0.239786,-0.503782,0
1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.405869,0.594438,-0.433864,-0.373194,0.0,-0.702652,-0.354174,0.174055,-0.370419,0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,-0.645686,-0.357403,0.546582,2.605953,0.0,0.660536,-0.393676,-0.398672,0.321606,0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,-0.645686,-0.809528,-0.925559,-0.373194,0.0,0.822887,-0.254125,-0.550168,0.325281,0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.405869,-2.380066,-0.189488,-0.373194,0.0,0.90168,0.178378,-0.561253,0.300841,0


In [19]:
X_train1, y_train1 = split_x_y(train_p, target)
X_val1, y_val1 = split_x_y(val_p, target)
X_test1, y_test1 = split_x_y(test_p, target)

In [20]:
print(f'Training size: {X_train1.shape}, Validation size: {X_val1.shape}, Testing size: {X_test1.shape}')

Training size: (15304, 82), Validation size: (7268, 82), Testing size: (7269, 82)


Set algorithm (select from {'dt', 'knn', 'logistic', 'rf', 'svm', 'xgboost'}) and hyperparameters

In [21]:
algo = 'rf'
params = {
    "n_estimators": 100,
    "criterion": "gini",
    "max_depth": 8,
    "random_state": 42,
    "class_weight": "balanced"
}

### Training pipeline

In [22]:
save_path = f'{models_dir}/valid_mode.pkl'
valid_model = train(X_train1,y_train1, algo, params, save=True, save_path=save_path)

Training completed.
Successfully saved model to ./outputs/loanstats_2023-03-21/valid_mode.pkl.


## Evaluate

Predict probabilities with benchmark model and valid model

In [30]:
# benchmark
bm_train_proba = benchmark_model.predict_proba(X_train)
bm_test_proba = benchmark_model.predict_proba(X_test)

In [31]:
# validatee
vm_train_proba = valid_model.predict_proba(X_train1)
vm_test_proba = valid_model.predict_proba(X_test1)

### Generic model performance metrics

Benchmark model

In [29]:
# training data
evaluate_performance(bm_train_proba, 0.3, y_true: np.ndarray, X: pd.DataFrame, model: BaseEstimator)

[array([0.31113866, 0.09547744, 0.14385734, ..., 0.34275335, 0.23518106,
        0.13159956]),
 array([0.18543096, 0.13455336, 0.21317888, ..., 0.16268003, 0.09734717,
        0.27762541]),
 array([0.25269176, 0.16497784, 0.24216055, ..., 0.10947752, 0.17218091,
        0.13126698])]