# ML Training Pipeline

## Load Packages

In [1]:
import pandas as pd
import yaml
import os

import json

import joblib
import importlib

from model_functions_module import train_model, tune_best_algo, explainability_mods
from model_functions_module import compile_algo_metrics, metrics_row

from sklearn import set_config
import alibi

# Ask scikit-learn transformers to return pandas DataFrames (keeping column
# names) instead of NumPy arrays for the rest of this notebook.
set_config(transform_output="pandas")

## Load Configuration Parameters

In [2]:
# Pipeline settings (project/resource/data/metrics directories, target column
# name) are read from params.yaml in the notebook's working directory.
with open('params.yaml', 'r') as params_stream:
    config_params = yaml.safe_load(params_stream)

## Define Functions

## Load Support Files and Data

In [3]:
# Resolve the two input locations once; every file below lives under one of them.
resource_dir = os.path.join(config_params['project_dir'], config_params['resource_dir'])
data_dir = os.path.join(config_params['project_dir'], config_params['data_dir'])

# Support files: column metadata and the catalogue of candidate algorithms.
with open(os.path.join(resource_dir, "col_dict.json"), "r") as cd:
    col_dict = json.load(cd)
with open(os.path.join(resource_dir, "algo_dict.json"), "r") as ad:
    algo_dict = json.load(ad)

# Data splits: train feeds baseline training and tuning, validate and test
# are only used for scoring further down.
train_df = pd.read_parquet(os.path.join(data_dir, 'train.parquet'))
validate_df = pd.read_parquet(os.path.join(data_dir, 'validate.parquet'))
test_df = pd.read_parquet(os.path.join(data_dir, 'test.parquet'))

## Train Baseline Models

In [4]:
# Split predictors from the target. ytrain is kept as a one-column DataFrame
# (double brackets), not a Series.
target_col = config_params['target_name']
Xtrain = train_df.drop(columns=target_col)
ytrain = train_df[[target_col]]

# info_dict: column name -> dtype name; type_dict: dtype name -> list of columns.
info_dict = {name: kind.name for name, kind in Xtrain.dtypes.items()}
type_dict = {}
for col, dtype in info_dict.items():
    type_dict.setdefault(dtype, []).append(col)

col_list = list(Xtrain.columns)

algo_list = list(algo_dict)

# Train every configured algorithm and pool the returned results.
# Each algo_dict entry names the import path ('module') and the estimator
# ('class'); the class itself is handed to train_model.
result_dict = {}
for algo_name, algo_data in algo_dict.items():
    module = importlib.import_module(algo_data['module'])
    algo = getattr(module, algo_data['class'])
    algo_result_dict = train_model(algo, algo_data, col_dict, Xtrain, ytrain)
    # presumably keyed by algorithm name, so entries do not collide - TODO confirm
    result_dict.update(algo_result_dict)

KNeighborsClassifier
Number of collinear feature: 3
Number of collinear feature: 3
Number of collinear feature: 3
Number of collinear feature: 3
Number of collinear feature: 3


KNeighborsClassifier Result Dictionary:
{'fit_time': [0.0994877815246582, 0.15291285514831543, 0.0958712100982666, 0.14492177963256836, 0.09598588943481445], 'score_time': [0.3770272731781006, 0.10036420822143555, 0.10024309158325195, 0.09916424751281738, 0.10172629356384277], 'test_accuracy': [0.971875, 0.9725, 0.975625, 0.9775, 0.97125], 'test_precision': [0.9724310776942355, 0.9712858926342073, 0.9822335025380711, 0.9823008849557522, 0.9688279301745636], 'test_f1': [0.9718221665623044, 0.9725, 0.9754253308128544, 0.9773584905660377, 0.97125], 'test_recall': [0.9712140175219024, 0.9737171464330413, 0.9687108886107635, 0.9724655819774718, 0.9736842105263158], 'test_roc_auc': [0.9876101368908389, 0.990012484394507, 0.9898202965942134, 0.9897374839648188, 0.9903155644722779]}
DecisionTreeClassifier
Number of col



Number of collinear feature: 3




Number of collinear feature: 3




Number of collinear feature: 3




Number of collinear feature: 3


AdaBoostClassifier Result Dictionary:
{'fit_time': [0.2107071876525879, 0.24090218544006348, 0.1934976577758789, 0.24126601219177246, 0.19296479225158691], 'score_time': [0.0622868537902832, 0.06271076202392578, 0.06188392639160156, 0.0628509521484375, 0.06223607063293457], 'test_accuracy': [0.99375, 0.994375, 0.99, 0.9925, 0.990625], 'test_precision': [0.9962264150943396, 0.9912935323383084, 0.9949431099873578, 0.9900373599003736, 0.9887640449438202], 'test_f1': [0.9937264742785445, 0.9943855271366189, 0.989937106918239, 0.9925093632958801, 0.9906191369606003], 'test_recall': [0.9912390488110138, 0.9974968710888611, 0.9849812265331664, 0.9949937421777222, 0.9924812030075187], 'test_roc_auc': [0.9996265619165031, 0.9998687497949216, 0.9995734368334951, 0.9996390619360344, 0.9994765592284952]}
ExtraTreesClassifier




Number of collinear feature: 3
Number of collinear feature: 3
Number of collinear feature: 3
Number of collinear feature: 3
Number of collinear feature: 3


ExtraTreesClassifier Result Dictionary:
{'fit_time': [0.2543790340423584, 0.29968786239624023, 0.2526850700378418, 0.24766325950622559, 0.29374098777770996], 'score_time': [0.06991100311279297, 0.11041522026062012, 0.0699620246887207, 0.07016587257385254, 0.06974005699157715], 'test_accuracy': [0.991875, 0.988125, 0.986875, 0.986875, 0.985625], 'test_precision': [0.9937185929648241, 0.9875, 0.9886934673366834, 0.9874686716791979, 0.986198243412798], 'test_f1': [0.9918495297805643, 0.9881175734834271, 0.986833855799373, 0.986850344395742, 0.9855799373040752], 'test_recall': [0.9899874843554443, 0.9887359198998749, 0.9849812265331664, 0.986232790988736, 0.9849624060150376], 'test_roc_auc': [0.9994343741162096, 0.9993999990624985, 0.9994601554064928, 0.9987187479980437, 0.9990749942187138]}
RandomForestClassifier
Number of collinear f

## Determine Best Model

In [5]:
# Collapse the pooled baseline results into one metrics table and persist it.
agg_metrics_df = compile_algo_metrics(result_dict)

baseline_metrics_path = os.path.join(
    config_params['project_dir'],
    config_params['metrics_dir'],
    'baseline_metrics.csv',
)
agg_metrics_df.to_csv(baseline_metrics_path)

# The winner is the row label with the highest 'test_f1_penalized'; the
# plain mean metrics are what gets reported for it.
reported_metrics = [
    'test_f1_mean',
    'test_roc_auc_mean',
    'test_recall_mean',
    'test_precision_mean',
    'test_accuracy_mean',
]
best_algo = agg_metrics_df['test_f1_penalized'].idxmax()
best_results = agg_metrics_df.loc[best_algo, reported_metrics]

print('Best model is:', best_algo)
print("Model's performance was:")
print(best_results)

Best model is: RandomForestClassifier
Model's performance was:
test_f1_mean           0.992357
test_roc_auc_mean      0.999369
test_recall_mean       0.991487
test_precision_mean    0.993244
test_accuracy_mean     0.992375
Name: RandomForestClassifier, dtype: float64


## Train and Tune Best Algorithm

In [6]:
# Resolve the estimator class of the winning baseline algorithm from its
# algo_dict entry ('module' = import path, 'class' = attribute name).
best_algo_data = algo_dict[best_algo]
module = importlib.import_module(best_algo_data['module'])
algo = getattr(module, best_algo_data['class'])

Xvalidate = validate_df.drop(config_params['target_name'], axis=1)
yvalidate = validate_df[[config_params['target_name']]]

print('Tuning best algorithm')
# Tuning only sees the training split; the validation split is used purely
# for scoring the tuned model below.
model = tune_best_algo(algo, best_algo_data, col_dict, Xtrain, ytrain)
shap_explainer, anchor_explainer = explainability_mods(model, Xtrain)
tuned_val_metrics = metrics_row(yvalidate, model, Xvalidate)
tuned_train_metrics = metrics_row(ytrain, model, Xtrain)

print("Tuned Model's performance on training data:")
print(tuned_train_metrics)

print("Tuned Model's performance on validation data:")
print(tuned_val_metrics)

# Persist both metric sets next to the baseline metrics.
metrics_dir = os.path.join(config_params['project_dir'], config_params['metrics_dir'])
pd.DataFrame(tuned_train_metrics).to_csv(os.path.join(metrics_dir, "tuned_1train_metrics.csv"))
pd.DataFrame(tuned_val_metrics).to_csv(os.path.join(metrics_dir, "tuned_2validation_metrics.csv"))

print('Saving model')
# The tuned model is saved as a single object, together with both explainers,
# under a local "model_artifacts" directory (relative to the working
# directory) so they can be reloaded at inference time.
model_dir = "model_artifacts"
os.makedirs(model_dir, exist_ok=True)
model_filepath = os.path.join(model_dir, 'model.joblib')
shap_filepath = os.path.join(model_dir, 'shap_explainer.joblib')
joblib.dump(model, model_filepath)
joblib.dump(shap_explainer, shap_filepath)

# The anchor explainer is saved through alibi's own saver into its own
# sub-directory rather than through joblib.
anchor_local_path = os.path.join(model_dir, 'anchor_explainer')
alibi.saving.save_explainer(anchor_explainer, anchor_local_path)

# NOTE: artifacts are only written locally. The previously commented-out S3
# upload steps were removed: they referenced `s3_client` and `bucket`, which
# are not defined in the visible cells of this notebook.

Tuning best algorithm
Tuning for hyperparameters
Training model
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Number of collinear feature: 3
[CV 1/5] END algorithm__bootstrap=False, algorithm__criterion=entropy, algorithm__max_depth=100, algorithm__max_features=0.25, algorithm__max_leaf_nodes=10, algorithm__min_samples_leaf=1, algorithm__min_samples_split=100, algorithm__n_estimators=50, algorithm__random_state=12, algorithm__warm_start=False; auc: (test=0.999) f1: (test=0.989) total time=   0.3s
Number of collinear feature: 3
[CV 2/5] END algorithm__bootstrap=False, algorithm__criterion=entropy, algorithm__max_depth=100, algorithm__max_features=0.25, algorithm__max_leaf_nodes=10, algorithm__min_samples_leaf=1, algorithm__min_samples_split=100, algorithm__n_estimators=50, algorithm__random_state=12, algorithm__warm_start=False; auc: (test=1.000) f1: (test=0.989) total time=   0.2s
Number of collinear feature: 3
[CV 3/5] END algorithm__bootstrap=False, algorithm__criterio

## Test Tuned Model

In [8]:
# Final hold-out check on test_df, the test.parquet split loaded earlier.
Xtest = test_df.drop(columns=config_params['target_name'])
ytest = test_df[[config_params['target_name']]]

# Score the artifact reloaded from disk rather than the in-memory `model`.
# `model_filepath` is defined in the "Train and Tune Best Algorithm" cell,
# which must have been run first.
tuned_model = joblib.load(model_filepath)

tuned_test_metrics = metrics_row(ytest, tuned_model, Xtest)
print(tuned_test_metrics)

test_metrics_path = os.path.join(
    config_params['project_dir'],
    config_params['metrics_dir'],
    "tuned_3test_metrics.csv",
)
pd.DataFrame(tuned_test_metrics).to_csv(test_metrics_path)

{'Accuracy': 0.994, 'Precision': 0.998, 'F1': 0.994, 'Recall': 0.99, 'ROC AUC': 0.994, 'conf_matrix': [['True Negative\n\nCount: 499\nActl Rate: 0.998', 'False Positive\n\nCount: 1\nActl Rate: 0.002'], ['False Negative\n\nCount: 5\nActl Rate: 0.01', 'True Positive\n\nCount: 495\nActl Rate: 0.99']]}
