# Train with provided features

## Load data to S3

In [12]:
# Download the UCI "tic-mld" files and stage them in the session's default S3 bucket as CSV.
import pandas as pd
import sagemaker

session = sagemaker.session.Session()
bucket = session.default_bucket()
prefix = '1_ins_dataset/raw'

train_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/ticdata2000.txt'
test_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/ticeval2000.txt'
gt_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/tictgts2000.txt'
cols_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/dictionary.txt'

# The data files carry no header row; the dictionary file is free text in latin-1.
train = pd.read_table(train_uri, header=None)
test = pd.read_table(test_uri, header=None)
ground_truth = pd.read_table(gt_uri, header=None)
columns = pd.read_table(cols_uri, encoding='latin-1')

# `prefix` already ends in "raw", so the object keys must not add a second "raw"
# segment: the pipeline's InputData default is s3://{bucket}/1_ins_dataset/raw and
# load_split.py reads the four CSVs from the top level of that location.
raw_s3_uri = f's3://{bucket}/{prefix}'
train.to_csv(f'{raw_s3_uri}/train.csv', index=False)
test.to_csv(f'{raw_s3_uri}/test.csv', index=False)
ground_truth.to_csv(f'{raw_s3_uri}/gt.csv', index=False)
columns.to_csv(f'{raw_s3_uri}/col_info.csv', index=False)

## Processing Script

In [2]:
%%writefile load_split.py
"""Processing script: label the raw files, combine them, and split into train/validate/test.

Reads train.csv, test.csv, gt.csv and col_info.csv from /opt/ml/processing/input and
writes one CSV per split under /opt/ml/processing/output/<split>/, with the target
column (CARAVAN) moved to the first position.
"""
import pandas as pd
import numpy as np
import joblib
import os

if __name__ == '__main__':
    input_path = '/opt/ml/processing/input'
    output_path = '/opt/ml/processing/output'

    # exist_ok=True tolerates pre-existing directories without hiding real errors
    # (permissions, read-only volume) and without skipping the remaining directories
    # when an earlier one already exists.
    for sub_dir in ('train', 'validate', 'test', 'encoder'):
        os.makedirs(os.path.join(output_path, sub_dir), exist_ok=True)

    print('Read in data')
    train = pd.read_csv(os.path.join(input_path, 'train.csv'))
    test = pd.read_csv(os.path.join(input_path, 'test.csv'))
    ground_truth = pd.read_csv(os.path.join(input_path, 'gt.csv'))
    columns = pd.read_csv(os.path.join(input_path, 'col_info.csv'))

    print('Getting column names')
    # Rows 1..86 of the dictionary are split into three whitespace-separated fields;
    # row 0 supplies the names of those fields (one of which must be 'Name').
    col_name_df = columns.iloc[1:87, 0].str.split(n=2, expand=True)
    col_name_df.columns = columns.iloc[0, 0].split(maxsplit=2)
    col_names = col_name_df['Name'].to_list()
    print(col_names)

    print('Combining data')
    # The evaluation file has no target column; gt.csv supplies it as the last column.
    test_df = pd.concat([test, ground_truth], axis=1)
    test_df.columns = col_names
    print(test_df.head())
    train.columns = col_names
    print(train.head())
    combined_df = pd.concat([train, test_df], ignore_index=True)
    print(combined_df.head())
    # Move the target to the first column (where the XGBoost CSV format expects the label).
    feature_cols = [col for col in combined_df.columns if col != 'CARAVAN']
    df = combined_df[['CARAVAN'] + feature_cols]
    print(df.head())

    print('Splitting data')
    # Seeded shuffle, then a 70% / 20% / 10% positional split.
    shuffled = df.sample(frac=1, random_state=1729)
    train_end = int(0.7 * len(df))
    validate_end = int(0.9 * len(df))
    train_data = shuffled.iloc[:train_end]
    validation_data = shuffled.iloc[train_end:validate_end]
    test_data = shuffled.iloc[validate_end:]

    print('Saving dataframe')
    # The SageMaker built-in XGBoost CSV format has no header record, so the two
    # training channels are written without one. The test split keeps its header
    # because the evaluation script loads it with pandas' default header handling.
    train_data.to_csv(os.path.join(output_path, 'train', 'train_feats.csv'), index=False, header=False)
    validation_data.to_csv(os.path.join(output_path, 'validate', 'validate_feats.csv'), index=False, header=False)
    test_data.to_csv(os.path.join(output_path, 'test', 'test_feats.csv'), index=False)

Overwriting load_split.py


## Evaluate model script

In [14]:
%%writefile evaluate_extd.py
"""Evaluation script: score the tuned XGBoost model on the held-out test split.

Unpacks /opt/ml/processing/model/model.tar.gz, predicts on
/opt/ml/processing/test/test_feats.csv (label in the first column) and writes a
metrics report to /opt/ml/processing/evaluation/evaluation.json.
"""
import json
import logging
import pathlib
import pickle
import tarfile
import numpy as np
import pandas as pd
import xgboost

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    roc_curve,
    auc,
    precision_recall_curve,
    precision_score,
    average_precision_score,
    roc_auc_score,
    log_loss,
    f1_score,
    recall_score,
    make_scorer,
    balanced_accuracy_score,
    cohen_kappa_score,
    matthews_corrcoef,
    fbeta_score)

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())


def _metric(value):
    """Wrap a scalar metric in the {"value", "standard_deviation"} report entry.

    The value is cast to a built-in float so json.dumps never sees a numpy scalar type
    it cannot serialise.
    """
    return {"value": float(value), "standard_deviation": "NaN"}


if __name__ == "__main__":
    model_path = "/opt/ml/processing/model/model.tar.gz"
    # Extract into the current directory, which is where the model file is opened below.
    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")

    logger.debug("Loading xgboost model")
    # NOTE(security): pickle.load executes arbitrary code from the file; this is only
    # acceptable because the artifact comes from this pipeline's own tuning step.
    with open("xgboost-model", "rb") as model_file:
        model = pickle.load(model_file)

    logger.debug("Loading test input data")
    test_path = "/opt/ml/processing/test/test_feats.csv"
    df = pd.read_csv(test_path)

    logger.debug("Reading test data")
    # First column is the label; everything else is a feature.
    y_test = df.iloc[:, 0].to_numpy()
    df.drop(df.columns[0], axis=1, inplace=True)
    X_test = xgboost.DMatrix(df.values)

    logger.info("Performing predictions against test data")
    prediction_probabilities = model.predict(X_test)
    predictions = np.round(prediction_probabilities)

    # Threshold-based metrics use the rounded 0/1 predictions.
    accuracy = accuracy_score(y_test, predictions)
    # labels=[0, 1] guarantees a 2x2 matrix even if one class is absent.
    conf_matrix = confusion_matrix(y_test, predictions, labels=[0, 1])
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    informedness = balanced_accuracy_score(y_test, predictions, adjusted=True)
    cohen_kappa = cohen_kappa_score(y_test, predictions)
    matthews_coef = matthews_corrcoef(y_test, predictions)
    fbeta = fbeta_score(y_test, predictions, beta=0.5)

    # Ranking/curve metrics need the continuous scores; feeding them rounded labels
    # collapses every curve to a single operating point and misreports the areas.
    fpr, tpr, thresholds = roc_curve(y_test, prediction_probabilities)
    roc = (fpr, tpr, thresholds)
    auc_score = auc(fpr, tpr)
    pr_curve = precision_recall_curve(y_test, prediction_probabilities)
    avg_precision = average_precision_score(y_test, prediction_probabilities)
    roc_auc = roc_auc_score(y_test, prediction_probabilities)

    for label, value in (
        ("Accuracy", accuracy),
        ("Confusion matrix", conf_matrix),
        ("AUC", auc_score),
        ("Precision Recall Curve", pr_curve),
        ("Precision", precision),
        ("Average Percision", avg_precision),
        ("ROC AUC", roc_auc),
        ("F1", f1),
        ("Recall", recall),
        ("ROC", roc),
        ("informedness", informedness),
        ("Cohen Kappa", cohen_kappa),
        ("Mathews Correlation Coefficient", matthews_coef),
        ("Fbeta", fbeta),
    ):
        logger.debug("{}: {}".format(label, value))

    report_dict = {
        "binary_classification_metrics": {
            "auc": _metric(auc_score),
            "precision": _metric(precision),
            "avg_percision": _metric(avg_precision),
            "roc_auc": _metric(roc_auc),
            "f1": _metric(f1),
            "recall": _metric(recall),
            "informedness": _metric(informedness),
            "cohen_kappa": _metric(cohen_kappa),
            "mathews_coef": _metric(matthews_coef),
            "fbeta": _metric(fbeta),
            "accuracy": _metric(accuracy),
            "confusion_matrix": {
                "0": {"0": int(conf_matrix[0][0]), "1": int(conf_matrix[0][1])},
                "1": {"0": int(conf_matrix[1][0]), "1": int(conf_matrix[1][1])},
            },
            # Key spelled as in the SageMaker model-quality metrics schema.
            "receiver_operating_characteristic_curve": {
                "false_positive_rates": fpr.tolist(),
                "true_positive_rates": tpr.tolist(),
            },
        }
    }

    print(report_dict)
    output_dir = "/opt/ml/processing/evaluation"
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    evaluation_path = f'{output_dir}/evaluation.json'
    with open(evaluation_path, "w") as f:
        f.write(json.dumps(report_dict))

Overwriting evaluate_extd.py


## Pipeline

In [3]:
import sagemaker
import sagemaker.session

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
)

from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.functions import Join
from sagemaker.workflow.execution_variables import ExecutionVariables

from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep
from sagemaker.estimator import Estimator
from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter
from sagemaker.workflow.steps import TuningStep
from sagemaker.workflow.pipeline import Pipeline

from sagemaker.processing import ScriptProcessor
from sagemaker.workflow.properties import PropertyFile

from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.workflow.step_collections import RegisterModel

# --- Session and account context -------------------------------------------------
session = sagemaker.session.Session()
bucket = session.default_bucket()
region = session.boto_region_name
role = sagemaker.get_execution_role()

# --- Naming ------------------------------------------------------------------------
prefix = '1_ins_dataset'
pipeline_name = "InsExample"  # SageMaker Pipeline name
model_package_group_name = "Insurance-Co-Example"  # Model name in model registry
framework_version = "0.23-1"  # version passed to the SKLearnProcessor below

# Tags attached to the pipeline when it is upserted.
tags = [
    {"Key": "DATASET", "Value": "InsCOIL"},
    {"Key": "SOURCE", "Value": "UCI"},
]

# --- Pipeline parameters (overridable per execution) -------------------------------
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1,
)
processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value="ml.t3.medium",
)
training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.xlarge",
)
max_training_jobs = ParameterInteger(
    name='MaximumTrainingJobs',
    default_value=1,
)
max_parallel_training_jobs = ParameterInteger(
    name='MaxParallelTrainingJobs',
    default_value=1,
)

# Location of the raw CSVs consumed by the preprocessing step.
input_uri = f's3://{bucket}/{prefix}/raw'
input_data = ParameterString(name="InputData", default_value=input_uri)

# Processor that runs load_split.py.
sklearn_processor = SKLearnProcessor(
    role=role,
    framework_version=framework_version,
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    base_job_name="ins-example-job",
)

def _split_output(split_name, container_dir):
    """Build the ProcessingOutput that uploads one split to <bucket>/<prefix>/no_encoder/<split_name>."""
    s3_destination = Join(
        on="/",
        values=["s3://{}".format(bucket), prefix, 'no_encoder', split_name],
    )
    return ProcessingOutput(
        output_name=split_name,
        source=container_dir,
        destination=s3_destination,
    )


# Runs load_split.py against the raw input and publishes the three splits.
step_preprocess = ProcessingStep(
    name="preprocess",
    processor=sklearn_processor,
    inputs=[
        ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
    ],
    outputs=[
        _split_output("train", "/opt/ml/processing/output/train"),
        _split_output("validate", "/opt/ml/processing/output/validate"),
        _split_output("test", "/opt/ml/processing/output/test"),
    ],
    code="load_split.py",
)


# Approval status given to the model package when it is registered.
model_approval_status = ParameterString(
    name='ModelApprovalStatus',
    default_value='PendingManualApproval'
)

# Built-in XGBoost container; also reused by the evaluation processor.
image_uri = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=region,
    version='1.2-2',
    py_version='py3',
    instance_type='ml.m5.xlarge')

xgb_estimator = Estimator(
    image_uri=image_uri,
    instance_type=training_instance_type,
    instance_count=1,
    role=role,
    disable_profiler=True)

# Without an explicit objective XGBoost trains a squared-error regressor. The tuner
# optimises a classification metric (validation:f1) and the evaluation script treats
# the model output as a probability that it rounds to a 0/1 class, so the learning
# task is pinned to binary classification here.
xgb_estimator.set_hyperparameters(objective='binary:logistic')

xgb_tuner = HyperparameterTuner(
    estimator=xgb_estimator,
    objective_metric_name="validation:f1",
    objective_type='Maximize',  # the SDK default, spelled out: higher F1 is better
    hyperparameter_ranges={
        'max_depth': IntegerParameter(1, 10),
        'eta': ContinuousParameter(0, 0.5),
        'gamma': ContinuousParameter(0, 5),
        'min_child_weight': ContinuousParameter(1, 120),
        'num_round': IntegerParameter(1, 2000)
    },
    max_jobs=max_training_jobs,
    max_parallel_jobs=max_parallel_training_jobs)

# Channels are wired to the preprocessing outputs, which makes this step depend on it.
step_tune = TuningStep(
    name='train-tune-model',
    tuner=xgb_tuner,
    inputs={
        'train': TrainingInput(
            s3_data=step_preprocess.properties.ProcessingOutputConfig.Outputs[
                "train"].S3Output.S3Uri,
            content_type='text/csv'),
        'validation': TrainingInput(
            s3_data=step_preprocess.properties.ProcessingOutputConfig.Outputs[
                "validate"].S3Output.S3Uri,
            content_type='text/csv')})

# Processor that runs the evaluation script inside the same image used for training.
evaluate_model_processor = ScriptProcessor(
    role=role,
    image_uri=image_uri,
    command=["python3"],
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    base_job_name="ins-example-job",
)

# Exposes evaluation.json (from the 'evaluation' output) to the pipeline.
evaluation_report = PropertyFile(
    name='EvaluationReport',
    output_name='evaluation',
    path='evaluation.json',
)

# Inputs: the top-ranked model artifact from tuning and the held-out test split.
best_model_input = ProcessingInput(
    source=step_tune.get_top_model_s3_uri(top_k=0, s3_bucket=bucket),
    destination="/opt/ml/processing/model",
)
test_split_input = ProcessingInput(
    source=step_preprocess.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
    destination="/opt/ml/processing/test",
)

# Output: the report is stored under a per-execution folder.
evaluation_output = ProcessingOutput(
    output_name='evaluation',
    source='/opt/ml/processing/evaluation',
    destination=Join(
        on='/',
        values=[
            's3://{}'.format(bucket),
            prefix,
            ExecutionVariables.PIPELINE_EXECUTION_ID,
            'evaluation-report',
        ],
    ),
)

step_evaluate = ProcessingStep(
    name='evaluate_model',
    processor=evaluate_model_processor,
    inputs=[best_model_input, test_split_input],
    outputs=[evaluation_output],
    code='evaluate_extd.py',
    property_files=[evaluation_report],
)

# Points the model package at the evaluation report written by the evaluation step.
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=Join(
            on='/',
            values=[
                step_evaluate.arguments["ProcessingOutputConfig"]["Outputs"][0]['S3Output']['S3Uri'],
                'evaluation.json']
        ),
        content_type='application/json')
)

step_register = RegisterModel(
    name='register-model',
    estimator=xgb_estimator,
    model_data=step_tune.get_top_model_s3_uri(top_k=0, s3_bucket=bucket),
    content_types=['text/csv'],
    response_types=['text/csv'],
    inference_instances=['ml.t2.medium', 'ml.m5.xlarge', 'ml.m5.large'],
    transform_instances=['ml.m5.xlarge'],
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics,
    # `step_evaluate.arguments[...]` is a plain value, not a step property reference,
    # so it creates no implicit ordering. Declare the dependency explicitly so the
    # model is only registered after its evaluation report has been produced.
    depends_on=[step_evaluate.name])

# Parameters that can be overridden when an execution is started.
pipeline_parameters = [
    processing_instance_type,
    processing_instance_count,
    training_instance_type,
    max_training_jobs,
    max_parallel_training_jobs,
    input_data,
    model_approval_status,
]

# Steps that make up the pipeline.
pipeline_steps = [
    step_preprocess,
    step_tune,
    step_evaluate,
    step_register,
]

pipeline = Pipeline(
    name=pipeline_name,
    parameters=pipeline_parameters,
    steps=pipeline_steps,
)

# Create or update the pipeline definition, then launch an execution.
pipeline.upsert(role_arn=role, tags=tags)

pipeline.start(execution_display_name="InsNoEncodeModel-2")

The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


_PipelineExecution(arn='arn:aws:sagemaker:us-east-1:707031497630:pipeline/insexample/execution/i5so7gcwqvth', sagemaker_session=<sagemaker.session.Session object at 0x7f4b98709040>)