In [None]:
import boto3
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

# Low-level SageMaker client; used further down to describe the training job.
sm_boto3 = boto3.client("sagemaker")

# SDK session: the region and the default bucket are both read from it.
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = sess.default_bucket()  # a hard-coded bucket name would work here as well

# Key prefix used for the S3 objects this notebook writes.
prefix = "nextera/monitoring"

print(f"Using bucket {bucket}")

### Prepare data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the values of the "Categorical" column with LabelEncoder's codes.
encoder = LabelEncoder()

df = pd.read_csv("data/X.csv")
df["Categorical"] = encoder.fit_transform(df["Categorical"])

# Convention used throughout this notebook: the last column is the target and
# every column before it is a feature.
feature_cols = list(df)[:-1]
target_col = list(df)[-1]
X = df[feature_cols]
Y = df[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

# Re-attach the target to each split under its actual column name. The name is
# taken from the data instead of being hard-coded, so a feature column can never
# be overwritten by the target. Copies are used so the assignment does not write
# into a slice of `df`.
trainX = X_train.copy()
trainX[target_col] = y_train

testX = X_test.copy()
testX[target_col] = y_test

trainX

In [None]:
# Write both splits as CSV files without the index column; X_train.csv and
# X_test.csv are also the default file names script.py reads.
for split_frame, csv_name in ((trainX, "X_train.csv"), (testX, "X_test.csv")):
    split_frame.to_csv(csv_name, index=False)

In [None]:
# Upload both CSV splits to S3; the returned values are later handed to `fit`
# as the "train" and "test" channels.
trainpath = sess.upload_data(key_prefix=prefix, bucket=bucket, path="X_train.csv")
testpath = sess.upload_data(key_prefix=prefix, bucket=bucket, path="X_test.csv")

### Training script

In [None]:
%%writefile script.py

import argparse
import joblib
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor


# inference functions ---------------
def model_fn(model_dir):
    """Load the model stored as ``model.joblib`` inside ``model_dir``.

    Args:
        model_dir: Directory that contains the ``model.joblib`` file.

    Returns:
        Whatever object ``joblib.load`` deserializes from that file.
    """
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)


if __name__ == "__main__":
    # Training entry point: parse the command line, fit a random forest on the
    # training CSV, print absolute-error percentiles measured on the test CSV
    # and save the fitted model as model.joblib.
    print("extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # to simplify the demo we don't use all sklearn RandomForest hyperparameters
    parser.add_argument("--n-estimators", type=int, default=10)
    parser.add_argument("--min-samples-leaf", type=int, default=3)

    # Data, model, and output directories (each defaults to an environment variable)
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="X_train.csv")
    parser.add_argument("--test-file", type=str, default="X_test.csv")

    # The feature names (one whitespace-separated string) and the target name have
    # no usable default, so argparse rejects a run that omits either of them
    # instead of failing later on a None value.
    parser.add_argument("--features", type=str, required=True)
    parser.add_argument("--target", type=str, required=True)

    # parse_known_args: options this script does not declare are ignored.
    args, _ = parser.parse_known_args()

    # A directory option is None when it was not passed and its environment
    # variable is unset; report that explicitly rather than letting
    # os.path.join fail on None.
    for option, env_var in (
        ("model_dir", "SM_MODEL_DIR"),
        ("train", "SM_CHANNEL_TRAIN"),
        ("test", "SM_CHANNEL_TEST"),
    ):
        if getattr(args, option) is None:
            parser.error(f"--{option.replace('_', '-')} was not given and {env_var} is not set")

    print("reading data")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print("building training and testing datasets")
    feature_names = args.features.split()
    X_train = train_df[feature_names]
    X_test = test_df[feature_names]
    y_train = train_df[args.target]
    y_test = test_df[args.target]

    # train
    print("training model")
    model = RandomForestRegressor(
        n_estimators=args.n_estimators, min_samples_leaf=args.min_samples_leaf, n_jobs=-1
    )

    model.fit(X_train, y_train)

    # absolute error of the predictions on the test set
    print("validating model")
    abs_err = np.abs(model.predict(X_test) - y_test)

    # print couple perf metrics; the exact "AE-at-<q>th-percentile: " wording is
    # what a log-scraping metric regex has to match, so keep it stable
    for q in [10, 50, 90]:
        print("AE-at-" + str(q) + "th-percentile: " + str(np.percentile(a=abs_err, q=q)))

    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("model persisted at " + path)

In [None]:
def list_to_string(input_list):
    """Join the items of ``input_list`` into one space-separated string.

    Each item is rendered the way an f-string renders it, and leading and
    trailing whitespace of the joined result is stripped.

    Args:
        input_list: Iterable of items to join.

    Returns:
        The space-separated string; ``""`` for an empty iterable.
    """
    # str.join builds the result in one pass instead of growing a string
    # piece by piece inside a loop.
    return " ".join(f"{item}" for item in input_list).strip()


# Last column of `df` is the target, every earlier column is a feature; the
# feature names are packed into one space-separated string for script.py.
column_names = df.columns.tolist()
features = list_to_string(column_names[:-1])
target = column_names[-1]

features

In [None]:
! python script.py --n-estimators 100 \
                   --min-samples-leaf 2 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./ \
                   --features 'Feature_1 Feature_2 Feature_3 Feature_4 Feature_5' \
                   --target {target}

### SageMaker Training

In [None]:
# We use the Estimator from the SageMaker Python SDK
from sagemaker.sklearn.estimator import SKLearn

# Version string passed to the SKLearn estimator below.
FRAMEWORK_VERSION = "0.23-1"

# Training job definition: script.py is the entry point and the job is configured
# for one ml.c5.xlarge instance under the notebook's execution role.
sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.c5.xlarge",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="rf-scikit",
    # The regex captures the number that follows "AE-at-50th-percentile: " in the job output.
    metric_definitions=[{"Name": "median-AE", "Regex": "AE-at-50th-percentile: ([0-9.]+).*$"}],
    hyperparameters={
        "n-estimators": 100,
        "min-samples-leaf": 3,
        "features": features,
        "target": target,
    },
)

# launch the training job; wait=True is passed, so this is not an asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

In [None]:
# Wait on the most recent training job (with logs="None"), then look up the S3
# location of the model artifact in the job description.
latest_job = sklearn_estimator.latest_training_job
latest_job.wait(logs="None")

job_description = sm_boto3.describe_training_job(TrainingJobName=latest_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print(f"Model artifact persisted at {artifact}")

In [None]:
from sagemaker.sklearn.model import SKLearnModel

# Wrap the trained artifact in a model object that can be deployed.
# NOTE(review): entry_point is "inference_handler.py", a file this notebook never
# writes (the training cell writes script.py, which defines model_fn) — confirm
# that the file exists next to the notebook.
model = SKLearnModel(
    model_data=artifact,
    role=get_execution_role(),
    entry_point="inference_handler.py",
    framework_version=FRAMEWORK_VERSION,
)

In [None]:
from sagemaker.model_monitor import DataCaptureConfig
from sagemaker.utils import name_from_base

# The "/" separators of the prefix are replaced by "-" before the prefix is used
# as the base of the endpoint name.
endpoint_name = name_from_base(prefix.replace('/', '-'))

# S3 location handed to the capture configuration as its destination.
s3_capture_upload_path = f"s3://{bucket}/{prefix}/data-capture"

# What to capture: "REQUEST", "RESPONSE", or both.
capture_modes = ["REQUEST", "RESPONSE"]

data_capture_config = DataCaptureConfig(
    enable_capture=True,
    sampling_percentage=100,  # Optional
    destination_s3_uri=s3_capture_upload_path,  # Optional
    # Single source of truth: the list defined above, not a second literal.
    capture_options=capture_modes,
)

predictor = model.deploy(
    instance_type="ml.c5.large",
    initial_instance_count=1,
    endpoint_name=endpoint_name,
    data_capture_config=data_capture_config,
)

In [None]:
import io
import json

# Runtime client used to call the deployed endpoint.
runtime_sm_client = boto3.client(service_name="sagemaker-runtime")

# Only the feature columns are sent; the last column of `df` (the target) is left out.
data = testX[list(df)[:-1]]

# Send every test row to the endpoint as one comma-separated (CSV) line.
for _index, row in data.iterrows():
    payload = ",".join(str(value) for value in row.to_list())

    # invoking endpoint
    response = runtime_sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=payload,
        Accept="text/csv",
        ContentType="text/csv",  # for csv 'application/x-npy' for numpy
    )

    # The response body is parsed with json.loads before being printed.
    result = json.loads(response["Body"].read())
    print(result)

### View captured data

In [None]:
import time

# the data capture may take a few seconds to appear
time.sleep(60)

s3_client = boto3.Session().client("s3")
current_endpoint_capture_prefix = f"{prefix}/data-capture/{endpoint_name}"

result = s3_client.list_objects(Bucket=bucket, Prefix=current_endpoint_capture_prefix)

# The response has no "Contents" entry when nothing matches the prefix yet, so
# default to an empty list and fail with an actionable message instead of a
# TypeError raised while iterating over None.
capture_files = [entry.get("Key") for entry in result.get("Contents", [])]
if not capture_files:
    raise RuntimeError(
        f"No capture files found under s3://{bucket}/{current_endpoint_capture_prefix} yet; "
        "wait a little longer and re-run this cell."
    )
print("Found Capture Files:")
print("\n ".join(capture_files))

In [None]:
def get_obj_body(obj_key):
    """Fetch the object ``obj_key`` from ``bucket`` and return its body as UTF-8 text.

    Uses the notebook-level ``s3_client`` and ``bucket``.
    """
    s3_object = s3_client.get_object(Bucket=bucket, Key=obj_key)
    raw_bytes = s3_object.get("Body").read()
    return raw_bytes.decode("utf-8")

# Download and print the capture file whose key comes last in `capture_files`.
capture_file = get_obj_body(capture_files[-1])
print(capture_file)

In [None]:
import json

# Parse the first line of the capture file as JSON and pretty-print it.
first_line = capture_file.partition("\n")[0]
print(json.dumps(json.loads(first_line), indent=2))

### Model Monitoring

In [None]:
from sagemaker.model_monitor import DefaultModelMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat
from sagemaker.model_monitor import CronExpressionGenerator
from monitoringjob_utils import run_model_monitor_job_processor

# Create the monitor object: one ml.m5.xlarge instance with a 20 GB volume and a
# 3600-second runtime limit, under the notebook's execution role.
my_default_monitor = DefaultModelMonitor(
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600,
)

In [None]:
# S3 location passed to the baselining job as its output URI.
baseline_results_uri = f"s3://{bucket}/{prefix}/baseline"

In [None]:
# Start the baselining job on the local training CSV (which has a header row);
# wait=True is passed to the call.
# NOTE(review): X_train.csv also contains the target column, so the baseline is
# computed over features plus target — confirm this matches the layout of the
# captured endpoint data it is compared against later.
my_default_monitor.suggest_baseline(
    baseline_dataset="X_train.csv",
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri=baseline_results_uri,
    wait=True,
)

### Explore the generated constraints and statistics

In [None]:
s3_client = boto3.Session().client("s3")
result = s3_client.list_objects(Bucket=bucket, Prefix=f"{prefix}/baseline")

# The response has no "Contents" entry when nothing matches the prefix, so
# default to an empty list and fail with a clear message instead of a TypeError
# raised while iterating over None.
report_files = [entry.get("Key") for entry in result.get("Contents", [])]
if not report_files:
    raise RuntimeError(
        f"No baseline files found under s3://{bucket}/{prefix}/baseline; "
        "check that the baselining job completed."
    )
print("Found Files:")
print("\n ".join(report_files))

In [None]:
import pandas as pd

# Flatten the "features" entry of the baseline statistics into a table and show
# its first ten rows.
baseline_job = my_default_monitor.latest_baselining_job
baseline_statistics = baseline_job.baseline_statistics()
schema_df = pd.json_normalize(baseline_statistics.body_dict["features"])
schema_df.head(10)

In [None]:
# Flatten the "features" entry of the suggested constraints into a table and
# show its first ten rows.
suggested_constraints = baseline_job.suggested_constraints()
constraints_df = pd.json_normalize(suggested_constraints.body_dict["features"])
constraints_df.head(10)

### Trigger job instantly

In [None]:
# S3 URIs of the baseline statistics and constraints files; each stays empty
# when no matching key is present in `report_files`.
s3_stats = ""
s3_const = ""

for file in report_files:
    # Match on the object's file name only, so that nothing in the key prefix can
    # influence the result, and require an explicit "constraints" match so that
    # an unrelated object under the prefix is not mistaken for the constraints file.
    file_name = file.rsplit("/", 1)[-1]
    if "statistics" in file_name:
        s3_stats = f"s3://{bucket}/{file}"
    elif "constraints" in file_name:
        s3_const = f"s3://{bucket}/{file}"
print(s3_stats)
print(s3_const)

In [None]:
preprocess_prefix = f"{prefix}/preprocess"
preprocess_file = "preprocess_v8.py"

# upload_data returns the S3 URI of the uploaded file
# (s3://<bucket>/<key_prefix>/<file name>). It is stored under its own name so
# that `trainpath`, the training-data URI used by the training cell, is no
# longer overwritten when this cell runs.
preprocess_path = sess.upload_data(
    path=preprocess_file, bucket=bucket, key_prefix=preprocess_prefix
)
print(preprocess_path)

In [None]:
# Key segment for this run's reports, derived from the base name 'reports'
# (presumably unique per call — confirm against name_from_base).
reports_path = name_from_base('reports')


# Run the monitoring job over the captured data of this endpoint, using the
# baseline statistics and constraints URIs selected earlier.
# NOTE(review): run_model_monitor_job_processor comes from the local
# monitoringjob_utils module, which is not shown here — confirm its parameters
# and return value there. The preprocessor argument is currently disabled.
processor = run_model_monitor_job_processor(
    region = region,
    instance_type = "ml.m5.xlarge",
    role = get_execution_role(),
    data_capture_path = f"{s3_capture_upload_path}/{endpoint_name}",
    statistics_path = s3_stats,
    constraints_path = s3_const,
    reports_path = f"s3://{bucket}/{prefix}/{reports_path}",
#     preprocessor_path=preprocess_path,
)

In [None]:
def get_latest_model_monitor_processing_job_name(base_job_name):
    """Return the name of the newest completed processing job matching ``base_job_name``.

    Jobs are listed with ``NameContains=base_job_name`` and
    ``StatusEquals="Completed"``, sorted by creation time in descending order;
    the first entry is returned.

    Raises:
        Exception: If the listing contains no job.
    """
    sagemaker_client = boto3.client("sagemaker")
    listing = sagemaker_client.list_processing_jobs(
        NameContains=base_job_name,
        SortBy="CreationTime",
        SortOrder="Descending",
        StatusEquals="Completed",
    )
    summaries = listing["ProcessingJobSummaries"]
    if not summaries:
        raise Exception("Processing job not found.")
    return summaries[0]["ProcessingJobName"]


def get_model_monitor_processing_job_s3_report(job_name):
    """Return the S3 URI of the first output of the processing job ``job_name``.

    The URI is read from ``ProcessingOutputConfig -> Outputs[0] -> S3Output ->
    S3Uri`` of the job description.
    """
    description = boto3.client("sagemaker").describe_processing_job(
        ProcessingJobName=job_name
    )
    first_output = description["ProcessingOutputConfig"]["Outputs"][0]
    return first_output["S3Output"]["S3Uri"]


# Name fragment used to find the monitoring processing job.
# NOTE(review): presumably the job started through run_model_monitor_job_processor
# carries this name — confirm in monitoringjob_utils.
MODEL_MONITOR_JOB_NAME = "sagemaker-model-monitor-analyzer"
latest_model_monitor_processing_job_name = get_latest_model_monitor_processing_job_name(
    MODEL_MONITOR_JOB_NAME
)
print(latest_model_monitor_processing_job_name)
# S3 URI of that job's first output. Note the name: `report_path` here is distinct
# from `reports_path`, the key segment chosen before the job was started.
report_path = get_model_monitor_processing_job_s3_report(latest_model_monitor_processing_job_name)
print(report_path)

In [None]:
result = s3_client.list_objects(Bucket=bucket, Prefix=f"{prefix}/{reports_path}")

# The response has no "Contents" entry when nothing matches the prefix, so
# default to an empty list and fail with a clear message instead of a TypeError
# raised while iterating over None.
report_files = [entry.get("Key") for entry in result.get("Contents", [])]
if not report_files:
    raise RuntimeError(
        f"No report files found under s3://{bucket}/{prefix}/{reports_path}; "
        "check that the monitoring job completed."
    )
print("Found Files:")
print("\n ".join(report_files))

In [None]:
# Download and print the first listed report file.
# NOTE: this reuses the name `capture_file`, which earlier held a data-capture file.
capture_file = get_obj_body(report_files[0])
print(capture_file)