# SM10: Evaluate Model Script - addtl metrics

## To Do

- Fix charts
- Fix the 3 broken metrics, probably need to be charts

In [9]:
%%writefile evaluate_extd.py
import json
import logging
import pathlib
import pickle
import tarfile
import numpy as np
import pandas as pd
import xgboost

from sklearn.metrics import(
    accuracy_score,
#     precision_score,
#     recall_score,
    confusion_matrix,
    roc_curve,
#     mine
    auc, 
    precision_recall_curve, 
    precision_score,
    average_precision_score,
    roc_auc_score,
    log_loss,
    f1_score,
    recall_score,
    roc_curve,
    make_scorer,
    balanced_accuracy_score,
    cohen_kappa_score,
    matthews_corrcoef,
    fbeta_score)

# Root logger at INFO: the logger.debug(...) calls below are suppressed unless
# the level is lowered to DEBUG.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())

if __name__ == "__main__":
    # Evaluate the trained XGBoost model on the held-out test set and write
    # the metrics report to /opt/ml/processing/evaluation/evaluation.json.

    # Extract into the current directory, which is where "xgboost-model" is
    # opened from below (extracting into ".." only worked when cwd was "/").
    model_path = "/opt/ml/processing/model/model.tar.gz"
    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")

    logger.debug("Loading xgboost model")
    # NOTE(review): pickle.load executes arbitrary code from the file; this is
    # only acceptable because the artifact comes from our own training step.
    with open("xgboost-model", "rb") as model_file:
        model = pickle.load(model_file)

    logger.debug("Loading test input data")
    test_path = "/opt/ml/processing/test/test_feats.csv"
    # NOTE(review): read_csv treats the first row as a header by default;
    # confirm test_feats.csv really has a header row.
    df = pd.read_csv(test_path)

    logger.debug("Reading test data")
    # First column is the label; everything else is a feature.
    y_test = df.iloc[:, 0].to_numpy()
    df.drop(df.columns[0], axis=1, inplace=True)
    X_test = xgboost.DMatrix(df.values)

    logger.info("Performing predictions against test data")
    prediction_probabilities = model.predict(X_test)
    # Hard 0/1 labels (threshold 0.5) for the threshold-dependent metrics.
    predictions = np.round(prediction_probabilities)

    # --- Threshold-dependent metrics: computed from hard class labels. ---
    accuracy = accuracy_score(y_test, predictions)
    conf_matrix = confusion_matrix(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    fbeta = fbeta_score(y_test, predictions, beta=0.5)
    informedness = balanced_accuracy_score(y_test, predictions, adjusted=True)
    cohen_kappa = cohen_kappa_score(y_test, predictions)
    matthews_coef = matthews_corrcoef(y_test, predictions)

    # --- Ranking metrics and curves: must be computed from the predicted
    # probabilities. Feeding rounded labels (as before) collapses each curve
    # to a single operating point and distorts the scores.
    roc = roc_curve(y_test, prediction_probabilities)
    fpr, tpr, _ = roc
    auc_score = auc(fpr, tpr)
    roc_auc = roc_auc_score(y_test, prediction_probabilities)
    avg_precision = average_precision_score(y_test, prediction_probabilities)
    pr_curve = precision_recall_curve(y_test, prediction_probabilities)
    pr_precisions, pr_recalls, _ = pr_curve
    # log_loss is still not reported; if enabled it must also be given
    # prediction_probabilities rather than the rounded labels.

    logger.debug("Accuracy: {}".format(accuracy))
    logger.debug("Confusion matrix: {}".format(conf_matrix))

    logger.debug("AUC: {}".format(auc_score))
    logger.debug("Precision Recall Curve: {}".format(pr_curve))
    logger.debug("Precision: {}".format(precision))
    logger.debug("Average Percision: {}".format(avg_precision))
    logger.debug("ROC AUC: {}".format(roc_auc))
    logger.debug("F1: {}".format(f1))
    logger.debug("Recall: {}".format(recall))
    logger.debug("ROC: {}".format(roc))
    logger.debug("informedness: {}".format(informedness))
    logger.debug("Cohen Kappa: {}".format(cohen_kappa))
    logger.debug("Mathews Correlation Coefficient: {}".format(matthews_coef))
    logger.debug("Fbeta: {}".format(fbeta))

    # .tolist() yields plain Python floats, which json can always serialize.
    roc_points = {
        "false_positive_rates": fpr.tolist(),
        "true_positive_rates": tpr.tolist(),
    }

    report_dict = {
        "binary_classification_metrics": {
            "auc": {"value": auc_score, "standard_deviation": "NaN"},
            "precision": {"value": precision, "standard_deviation": "NaN"},
            "avg_percision": {"value": avg_precision, "standard_deviation": "NaN"},
            "roc_auc": {"value": roc_auc, "standard_deviation": "NaN"},
            "f1": {"value": f1, "standard_deviation": "NaN"},
            "recall": {"value": recall, "standard_deviation": "NaN"},
            "informedness": {"value": informedness, "standard_deviation": "NaN"},
            "cohen_kappa": {"value": cohen_kappa, "standard_deviation": "NaN"},
            "mathews_coef": {"value": matthews_coef, "standard_deviation": "NaN"},
            "fbeta": {"value": fbeta, "standard_deviation": "NaN"},
            "accuracy": {"value": accuracy, "standard_deviation": "NaN"},
            "confusion_matrix": {
                "0": {"0": int(conf_matrix[0][0]), "1": int(conf_matrix[0][1])},
                "1": {"0": int(conf_matrix[1][0]), "1": int(conf_matrix[1][1])},
            },
            # Legacy (misspelled) key kept so existing readers keep working.
            "receiver_operating_charastic_curve": roc_points,
            # Correctly spelled key plus the PR curve, in the layout used by
            # the SageMaker model-quality metrics schema so charts can render.
            "receiver_operating_characteristic_curve": roc_points,
            "precision_recall_curve": {
                "precisions": pr_precisions.tolist(),
                "recalls": pr_recalls.tolist(),
            },
        }
    }

    print(report_dict)
    output_dir = "/opt/ml/processing/evaluation"
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    evaluation_path = f'{output_dir}/evaluation.json'
    with open(evaluation_path, "w") as f:
        f.write(json.dumps(report_dict))

Overwriting evaluate_extd.py


In [10]:
import sagemaker
import sagemaker.session

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
)

from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.functions import Join
from sagemaker.workflow.execution_variables import ExecutionVariables

from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep
from sagemaker.estimator import Estimator
from sagemaker.workflow.pipeline import Pipeline

from sagemaker.processing import ScriptProcessor
from sagemaker.workflow.properties import PropertyFile

from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.workflow.step_collections import RegisterModel

# --- Session / account context -------------------------------------------
session = sagemaker.session.Session()
region = session.boto_region_name
role = sagemaker.get_execution_role()
bucket = session.default_bucket()
prefix = '1_ins_dataset'

pipeline_name = "InsExample"  # SageMaker Pipeline name
model_package_group_name = "Insurance-Co-Example"  # Model name in model registry
framework_version = "0.23-1"

# --- Dataset locations (defaults for the pipeline parameters below) ------
train_uri = f's3://{bucket}/{prefix}/final/train/train_feats.csv'
validate_uri = f's3://{bucket}/{prefix}/final/validate/validate_feats.csv'
test_uri = f's3://{bucket}/{prefix}/final/test/test_feats.csv'


# Tags attached to the pipeline by pipeline.upsert(..., tags=tags) further
# down. This definition was commented out, so a fresh kernel raised NameError
# at the upsert call.
tags = [
    {"Key": "DATASET", "Value": "InsCOIL"},
    {"Key": "SOURCE", "Value": "UCI"},
]

# --- Pipeline parameters (overridable per execution) ---------------------
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
processing_instance_type = ParameterString(
    name="ProcessingInstanceType", default_value="ml.t3.medium")

training_instance_type = ParameterString(
    name="TrainingInstanceType", default_value="ml.m5.xlarge")

train_data = ParameterString(
    name="TrainData",
    default_value=train_uri
)
validate_data = ParameterString(
    name="ValidateData",
    default_value=validate_uri
)
test_data = ParameterString(
    name="TestData",
    default_value=test_uri
)

# Approval status the registered model package starts with.
model_approval_status = ParameterString(
    name='ModelApprovalStatus',
    default_value='PendingManualApproval'
)

# Look up the XGBoost 1.2-2 container image for the current region.
image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.2-2",
    py_version="py3",
    instance_type="ml.m5.xlarge",
)

# Model artifacts land under s3://<bucket>/<prefix>/<pipeline execution id>/model.
model_output_path = Join(
    on="/",
    values=[
        f"s3://{bucket}",
        prefix,
        ExecutionVariables.PIPELINE_EXECUTION_ID,
        "model",
    ],
)

xgb_estimator = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type=training_instance_type,
    output_path=model_output_path,
    disable_profiler=True,
)

# Binary classifier hyperparameters passed to the training container.
xgb_hyperparameters = {
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.8,
    "objective": "binary:logistic",
    "num_round": 25,
}
xgb_estimator.set_hyperparameters(**xgb_hyperparameters)

# Both channels are CSV; their S3 locations come from pipeline parameters.
training_channels = {
    "train": TrainingInput(s3_data=train_data, content_type="text/csv"),
    "validation": TrainingInput(s3_data=validate_data, content_type="text/csv"),
}

step_train = TrainingStep(
    name="train_model",
    estimator=xgb_estimator,
    inputs=training_channels,
)

# Processor that runs the evaluation script with python3 inside `image_uri`.
evaluate_model_processor = ScriptProcessor(
    role=role,
    image_uri=image_uri,
    command=["python3"],
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    base_job_name="ins-example-job",
)

# Exposes evaluation.json (from the "evaluation" output) to the pipeline.
evaluation_report = PropertyFile(
    name="EvaluationReport",
    output_name="evaluation",
    path="evaluation.json",
)

# Inputs: the trained model artifact and the test dataset.
model_artifact_input = ProcessingInput(
    source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    destination="/opt/ml/processing/model",
)
test_dataset_input = ProcessingInput(
    source=test_data,
    destination="/opt/ml/processing/test",
)

# Output: the report directory, uploaded under
# s3://<bucket>/<prefix>/<pipeline execution id>/evaluation-report.
evaluation_output = ProcessingOutput(
    output_name="evaluation",
    source="/opt/ml/processing/evaluation",
    destination=Join(
        on="/",
        values=[
            f"s3://{bucket}",
            prefix,
            ExecutionVariables.PIPELINE_EXECUTION_ID,
            "evaluation-report",
        ],
    ),
)

step_evaluate = ProcessingStep(
    name="evaluate_model",
    processor=evaluate_model_processor,
    inputs=[model_artifact_input, test_dataset_input],
    outputs=[evaluation_output],
    code="evaluate_extd.py",
    property_files=[evaluation_report],
)

# Point the model-registry metrics at the evaluation.json written by the step.
evaluation_s3_prefix = (
    step_evaluate.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
)
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=Join(on="/", values=[evaluation_s3_prefix, "evaluation.json"]),
        content_type="application/json",
    )
)

# Register the trained model, with its evaluation metrics, in the model
# package group.
step_register = RegisterModel(
    name="register-model",
    estimator=xgb_estimator,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    model_metrics=model_metrics,
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium", "ml.m5.xlarge", "ml.m5.large"],
    transform_instances=["ml.m5.xlarge"],
)

# Parameters that can be overridden when an execution is started.
pipeline_parameters = [
    processing_instance_type,
    processing_instance_count,
    training_instance_type,
    train_data,
    validate_data,
    test_data,
    model_approval_status,
]

# Train -> evaluate -> register.
pipeline_steps = [step_train, step_evaluate, step_register]

pipeline = Pipeline(
    name=pipeline_name,
    parameters=pipeline_parameters,
    steps=pipeline_steps,
)

# Create or update the pipeline definition, then start an execution.
pipeline.upsert(role_arn=role, tags=tags)

pipeline.start(execution_display_name="InsPrebuiltModelEvalExtd-8")

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


_PipelineExecution(arn='arn:aws:sagemaker:us-east-1:707031497630:pipeline/insexample/execution/tp906pdgp244', sagemaker_session=<sagemaker.session.Session object at 0x7f0a45e8fac0>)

In [None]:
import pandas as pd
import numpy as np
import boto3
import logging
import os
import warnings
import joblib
import argparse

warnings.simplefilter("once")

from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import IsolationForest
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# from hyperopt import hp
from sklearn.metrics import auc, precision_recall_curve, precision_score, average_precision_score
from sklearn.metrics import roc_auc_score, log_loss, f1_score, recall_score, roc_curve
from sklearn.metrics import make_scorer, balanced_accuracy_score, cohen_kappa_score, matthews_corrcoef, fbeta_score

In [None]:
def get_data(feats_filepath, gt_filepath):
    """Load features and ground truth from parquet and align them row-wise.

    Both files are indexed by the module-level ``index_col``; missing values
    in the ground truth are filled with 0. Only rows present in both files
    are kept (inner merge).

    Returns:
        A ``(feats, target)`` pair of DataFrames, each with ``index_col``
        restored as a regular column.
    """
    feature_frame = pd.read_parquet(feats_filepath).set_index(index_col)
    label_frame = pd.read_parquet(gt_filepath).set_index(index_col).fillna(0)
    label_columns = list(label_frame.columns)

    joined = feature_frame.merge(label_frame, on=index_col, how='inner')

    # Split the joined frame back into feature columns and label columns.
    feats = joined.drop(columns=label_columns).reset_index()
    target = joined[label_columns].reset_index()
    return feats, target

# Column used by get_data as the index / merge key.
index_col = "transaction_id"

# Extra scorers for cross_validate that are not passed as built-in scoring
# names: adjusted balanced accuracy (informedness), Cohen's kappa, Matthews
# correlation coefficient and F-beta with beta=0.5.
informedness, kappa, mcc, fbet = (
    make_scorer(balanced_accuracy_score, adjusted=True),
    make_scorer(cohen_kappa_score),
    make_scorer(matthews_corrcoef),
    make_scorer(fbeta_score, beta=0.5),
)

In [None]:
import sagemaker

# SageMaker session context for this notebook.
session = sagemaker.session.Session()
role = sagemaker.get_execution_role()
region = session.boto_region_name

# Location of the dataset split and the event type being modelled.
bucket, prefix = "asurion-fraud-data-science-prod", "split_data/2022-10-31"
event_param = "fulfill"

# Column used as the index / merge key when loading data.
index_col = "transaction_id"

In [None]:
# S3 locations of the training features and the ground-truth labels. The
# event segment comes from `event_param` instead of a hard-coded "fulfill".
train_filepath = f's3://{bucket}/{prefix}/{event_param}/final/train/train_feats.parquet'
gt_filepath = f's3://{bucket}/{prefix}/{event_param}/processed/ground_truth/org_fraud_gt.parquet'

# Load the data here: the training and cross-validation cells below use
# `train_feats` / `train_target`, which were never assigned anywhere.
train_feats, train_target = get_data(train_filepath, gt_filepath)

In [None]:
print('Training XGBoost model')
# Drop columns that are entirely NaN and move the id column into the index so
# it is not used as a feature.
fit_features = train_feats.dropna(axis=1, how='all').set_index(index_col)
fit_labels = train_target['org_fraud']

estimator = XGBClassifier()
estimator.fit(fit_features, fit_labels)
print('Finished training')

In [None]:
print(f"Cross-Validating model with {train_feats.shape[0]} samples")

# Same preparation as for training: drop all-NaN columns, index by id column.
cv_features = train_feats.dropna(how='all', axis=1).set_index(index_col)
cv_labels = train_target['org_fraud']

# First pass: built-in scorers.
builtin_scoring = (
    'roc_auc',
    'average_precision',
    'precision',
    'recall',
    'f1',
    'neg_log_loss',
    'neg_brier_score',
)
report1_dict = cross_validate(
    estimator,
    cv_features,
    cv_labels,
    cv=5,
    return_estimator=True,
    error_score="raise",
    scoring=builtin_scoring,
)
print("roc_auc and average_percision are the important ones here")
print(report1_dict)

# Second pass: the custom scorers built with make_scorer.
custom_scoring = {
    'informedness': informedness,
    'cohen_kappa': kappa,
    'matthews_corr': mcc,
    'f_beta': fbet,
}
report2_dict = cross_validate(
    estimator,
    cv_features,
    cv_labels,
    cv=5,
    return_estimator=True,
    error_score="raise",
    scoring=custom_scoring,
)
print("Additional metrics")
print(report2_dict)

In [None]:
if __name__ == '__main__':
    # Argument parsing restored: it was fully commented out, so `args` was
    # undefined and the joblib.dump call below raised NameError.
    parser = argparse.ArgumentParser()

    # Defaults are read from the SM_* environment variables.
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--feats-dir', type=str, default=os.environ.get('SM_CHANNEL_TRAIN_DATA'))
    parser.add_argument('--labels-dir', type=str, default=os.environ.get('SM_CHANNEL_TRAIN_LABELS'))

    # parse_known_args ignores unrecognised argv entries instead of exiting
    # (e.g. extra flags injected by the process that launched this code).
    args, _unknown_args = parser.parse_known_args()

    print("Args: {}".format(args))

    print('Saving model')
    joblib.dump(estimator, os.path.join(args.model_dir, 'model.joblib'))
    
def model_fn(model_dir):
    """Load and return the fitted model stored as ``model.joblib`` in *model_dir*."""
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)
    
def input_fn(input_data, content_type):
    """Parse an input payload into a pandas DataFrame.

    Supported content types are ``text/csv``, ``application/x-parquet`` and
    ``application/json``. The data is expected to contain features only (no
    target column).

    Args:
        input_data: Anything the matching ``pd.read_*`` function accepts
            (path or file-like object). Raw ``bytes``/``bytearray`` payloads
            are also accepted and wrapped in an in-memory buffer.
        content_type: MIME type describing the format of ``input_data``.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        ValueError: If ``content_type`` is not one of the supported types.
    """
    from io import BytesIO  # local import: only needed for raw byte payloads

    # pandas readers expect a path or a file-like object, not raw bytes.
    # NOTE(review): a plain str is still passed through untouched, so pandas
    # treats it as a path/URL - confirm what the serving container sends.
    if isinstance(input_data, (bytes, bytearray)):
        input_data = BytesIO(input_data)

    if content_type == 'text/csv':
        # Read the raw input data as CSV.
        df = pd.read_csv(input_data)
    elif content_type == 'application/x-parquet':  # was `=`, a SyntaxError
        df = pd.read_parquet(input_data)
    elif content_type == 'application/json':  # was `=`, a SyntaxError
        df = pd.read_json(input_data)
    else:
        raise ValueError("{} not supported by script!".format(content_type))
    return df

def predict_fn(input_data, model):
    """Run the model on the parsed input and return its predictions.

    Args:
        input_data: Parsed request data, passed straight to ``model.predict``.
        model: Object exposing a ``predict`` method.

    Returns:
        Whatever ``model.predict(input_data)`` returns.
    """
    print('Making predictions')
    return model.predict(input_data)

def output_fn(prediction, response_content_type):
    """Serialize predictions into a response of the requested content type.

    Args:
        prediction: Model output. Array-like objects exposing ``tolist()``
            are converted to plain Python lists before JSON encoding.
        response_content_type: Requested MIME type; ``application/json`` and
            ``text/csv`` are supported. The same value is used as the
            response mimetype.

    Returns:
        A ``worker.Response`` carrying the serialized predictions.

    Raises:
        RuntimeError: If ``response_content_type`` is not supported.
    """
    import json  # local import: json is not imported at this point in the file

    # NOTE(review): `worker` is not imported anywhere in the visible source -
    # presumably sagemaker_containers.beta.framework.worker; confirm and
    # import it where this script is deployed.
    if response_content_type == "application/json":
        # json.dumps cannot encode numpy arrays; convert to nested lists first.
        payload = prediction.tolist() if hasattr(prediction, "tolist") else prediction
        return worker.Response(json.dumps(payload), mimetype=response_content_type)
    elif response_content_type == 'text/csv':  # was the undefined name `accept`
        df_response = pd.DataFrame(prediction)
        csv_response = df_response.to_csv(index=False)
        return worker.Response(csv_response, mimetype=response_content_type)
    else:
        # RuntimeException does not exist in Python; RuntimeError is the builtin.
        raise RuntimeError("{} accept type is not supported by this script.".format(response_content_type))