# Train a Cornac Model using Script Mode

## Setup Sagemaker

In [1]:
# %pip install -U sagemaker

In [2]:
# %pip install sagemaker-experiments

In [3]:
import boto3
import pandas as pd
import numpy as np
import time
import sagemaker
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup

In [4]:
%env AWS_PROFILE=aeroxye-sagemaker

env: AWS_PROFILE=aeroxye-sagemaker


In [5]:
!aws sts get-caller-identity

{
    "UserId": "AROAWC4YSIQL5OBFCNGEX:botocore-session-1687237823",
    "Account": "418542404631",
    "Arn": "arn:aws:sts::418542404631:assumed-role/SageMaker-UserRole/botocore-session-1687237823"
}


In [6]:
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='SageMaker-UserRole')['Role']['Arn']

region = boto3.Session().region_name
print(f'Current region: {region}')

boto_session = boto3.Session(region_name=region)
sagemaker_session = sagemaker.Session(boto_session=boto_session)
sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
sagemaker_client.list_feature_groups()

featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime
)

Couldn't call 'get_role' to get Role ARN from role name SageMaker-UserRole to get Role path.


Current region: ap-southeast-1


In [7]:
# users = pd.DataFrame()
users_feature_group = FeatureGroup(name="users-feature-group", sagemaker_session=sagemaker_session)

# get single record from user
record_identifier_value = str("079b0ec9-cec6-42fb-9f00-7891c52a10fb")
featurestore_runtime.get_record(FeatureGroupName="users-feature-group",
                                RecordIdentifierValueAsString=record_identifier_value)

{'ResponseMetadata': {'RequestId': '4a02fa6c-9573-4f42-9b73-cc68ca26f87f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '4a02fa6c-9573-4f42-9b73-cc68ca26f87f',
   'content-type': 'application/json',
   'content-length': '1661',
   'date': 'Tue, 20 Jun 2023 05:38:18 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'id',
   'ValueAsString': '079b0ec9-cec6-42fb-9f00-7891c52a10fb'},
  {'FeatureName': 'has_other_cats', 'ValueAsString': '0'},
  {'FeatureName': 'personality', 'ValueAsString': 'all sweet'},
  {'FeatureName': 'gender', 'ValueAsString': 'no preference'},
  {'FeatureName': 'good_with_other_dogs', 'ValueAsString': '0'},
  {'FeatureName': 'employment', 'ValueAsString': 'working full time'},
  {'FeatureName': 'created_at', 'ValueAsString': '2023-05-18T17:13:45Z'},
  {'FeatureName': 'agree_to_fee', 'ValueAsString': '1'},
  {'FeatureName': 'is_first_cat', 'ValueAsString': '1'},
  {'FeatureName': 'good_with_kids', 'ValueAsString': '0'},
  {'FeatureName': 'att

## Train model
The model is trained using the SageMaker SDK's Estimator class. Firstly, get the execution role for training. This role allows us to access the S3 bucket in the last step, where the train and test data set is located.

In [8]:
# TODO: add code to retrieve latest train/test split based on type of split
split_type = "strat"
object_name = "strat-2023-06-16-07-09-28-233"
train_uri = f"s3://petfinder6000-training/{object_name}/output/train/train.csv"
eval_uri = f"s3://petfinder6000-training/{object_name}/output/validation/validation.csv"
test_uri = f"s3://petfinder6000-training/{object_name}/output/test/test.csv"

In [9]:
def get_uri_instance(run_mode):
    if run_mode == 'LOCAL':
        image_uri = 'cornac-39' # can pull remote container from ECR too
        instance_type = 'local'
    else:
        image_uri = '418542404631.dkr.ecr.ap-southeast-1.amazonaws.com/petfinder6000:cornac-39-v2'
        instance_type = 'ml.m5.xlarge'
    return image_uri, instance_type

In [11]:
# Docs: https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/using_tf.html
# https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker-experiments/sagemaker_job_tracking/tensorflow_script_mode_training_job.ipynb

from sagemaker.tensorflow import TensorFlow
from sagemaker.inputs import TrainingInput
from sagemaker.experiments.run import Run

exp_name = 'testing2'
run_name = 'bpr-' + time.strftime("%Y%m%d-%H-%M-%S", time.gmtime())
job_prefix = "petfinder6000-test/training"

with Run(experiment_name=exp_name, run_name=run_name, sagemaker_session=sagemaker_session) as run:
    # setup hyperparameters
    k = 50
    max_iter = 200
    learning_rate = 0.001
    lambda_reg = 0.001
    run.log_parameter("k", k)
    run.log_parameter("max_iter", max_iter)
    run.log_parameter("learning_rate", learning_rate)
    run.log_parameter("lambda_reg", lambda_reg)

    run_mode = 'LOCAL'
    image_uri, instance_type = get_uri_instance(run_mode)

    estimator = TensorFlow(
        image_uri=image_uri,
        entry_point="train_bpr.py",
        source_dir="./training",
        dependencies=[
            './training/metrics/harmonic_mean.py',
            './training/metrics/combined_eval_method.py',
            './training/metrics/serendipity_wrapper.py'
        ],
        role=role,
        instance_count=1,
        instance_type=instance_type,
        base_job_name=job_prefix,
        environment={"REGION": region, "EXP_NAME": exp_name, "RUN_NAME": run_name},
        hyperparameters={
            "k": k,
            "max_iter": max_iter,
            "learning_rate": learning_rate,
            "lambda_reg": lambda_reg,
        },
    )

    # train estimator
    estimator.fit(
        inputs={
            "train": TrainingInput(s3_data=train_uri, content_type="text/csv"),
            "eval": TrainingInput(s3_data=eval_uri, content_type="text/csv"),
        },
        job_name=f"{job_prefix}/{run_name}",
        wait=False
    )

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:sagemaker:Creating training-job with name: petfinder6000-test/training/bpr-20230620-05-39-00
INFO:sagemaker.local.local_session:Starting training job
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-gxjd8:
    command: train
    container_name: dsw22y960o-algo-1-gxjd8
    environment:
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    image: cornac-39
    networks:
      sagemaker-local:
        aliases:
        - algo-1-gxjd8
    stdin_open: true
    tty: true
    volumes:
    - C:\Users\yongr\AppData\Local\Temp\tmpmect1da_\algo-1-gxjd8\output/data:/opt/ml/output/data
    - C:\Users\yongr\AppData\Local\Temp\tmpmect1da_\algo-1-gxjd8\output:/opt/ml/output
    - C:\Users\yongr\AppData\Local\Temp\tmpmect1da_\algo-1-gxjd8\input:/opt/ml

Container dsw22y960o-algo-1-gxjd8  Creating
Container dsw22y960o-algo-1-gxjd8  Created
Attaching to dsw22y960o-algo-1-gxjd8
dsw22y960o-algo-1-gxjd8  | 2023-06-20 05:39:08,744 botocore.credentials INFO     Found credentials in environment variables.
dsw22y960o-algo-1-gxjd8  | 2023-06-20 05:39:09,083 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
dsw22y960o-algo-1-gxjd8  | /usr/local/bin/python -m pip install -r requirements.txt
dsw22y960o-algo-1-gxjd8  | Collecting sagemaker-training==4.5.0
dsw22y960o-algo-1-gxjd8  |   Downloading sagemaker_training-4.5.0.tar.gz (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.1/58.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m eta [36m-:--:--[0m
dsw22y960o-algo-1-gxjd8  | [?25h  Preparing metadata (setup.py) ... [?25ldone
dsw22y960o-algo-1-gxjd8  | [?25hCollecting sagemaker-experiments==0.1.45
dsw22y960o-algo-1-gxjd8  |   Downloading sagemaker_experiments-0.1.45-py3-none-any.wh

INFO:root:creating C:\Users\yongr\AppData\Local\Temp\tmpmect1da_\artifacts\output\data
INFO:root:copying C:\Users\yongr\AppData\Local\Temp\tmpmect1da_\algo-1-gxjd8\output\data\CornacExp-2023-06-20_05-47-40-352874.log -> C:\Users\yongr\AppData\Local\Temp\tmpmect1da_\artifacts\output\data
INFO:root:copying C:\Users\yongr\AppData\Local\Temp\tmpmect1da_\algo-1-gxjd8\output\success -> C:\Users\yongr\AppData\Local\Temp\tmpmect1da_\artifacts\output
INFO:root:copying C:\Users\yongr\AppData\Local\Temp\tmpmect1da_\model\model.pkl -> C:\Users\yongr\AppData\Local\Temp\tmpmect1da_\artifacts\model


dsw22y960o-algo-1-gxjd8 exited with code 0
Aborting on container exit...
Container dsw22y960o-algo-1-gxjd8  Stopping
Container dsw22y960o-algo-1-gxjd8  Stopped
===== Job Complete =====


### Evaluate Model

In [None]:
from sagemaker.workflow.properties import PropertyFile
from sagemaker.processing import FrameworkProcessor, ProcessingInput, ProcessingOutput
from sagemaker.sklearn import SKLearn

evaluation_prefix = f"{job_prefix}/evaluation"
report_path = f"{evaluation_prefix}/report"
model_path = estimator.model_data.replace("\\","/")
print(f"Model is stored in: {model_path}")

evaluate_model_processor = FrameworkProcessor(
    role=role,
    image_uri=image_uri,
    base_job_name=evaluation_prefix,
    estimator_cls=SKLearn,
    framework_version='0.23-1',
    command=["python3"],
    instance_count=1,
    instance_type=instance_type,
    sagemaker_session=sagemaker_session,
)

evaluation_report = PropertyFile(
    name="EvaluationReport", output_name="evaluation", path="evaluation.json"
)

eval_args = evaluate_model_processor.run(
    inputs=[
        ProcessingInput(source=model_path, destination="/opt/ml/processing/model"),
        ProcessingInput(source=train_uri, destination="/opt/ml/processing/train"),
        ProcessingInput(source=test_uri, destination="/opt/ml/processing/test"),
    ],
    outputs=[
        ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation", destination=report_path),
    ],
    code="evaluate.py",
    source_dir="./evaluation",
    dependencies=[
        './evaluation/metrics/harmonic_mean.py',
        './evaluation/metrics/combined_eval_method.py',
        './evaluation/metrics/serendipity_wrapper.py'
    ],
    job_name=evaluation_prefix,
)

### Register Trained Model in Model Registry

In [12]:
from sagemaker.tensorflow import TensorFlowModel
from sagemaker.model_metrics import MetricsSource, ModelMetrics

inference_instance_type = "ml.m5.xlarge"
ecr_image = '418542404631.dkr.ecr.ap-southeast-1.amazonaws.com/petfinder6000:cornac-39-inference-v13'

print(f"Location of model: {model_path}")
model = TensorFlowModel(
    image_uri=ecr_image,
    source_dir="./inference",
    model_data=model_path,
    role=role,
    sagemaker_session=sagemaker_session
)

evaluation_s3_uri = f"s3://{report_path}/output/evaluation/evaluation.json"

model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=evaluation_s3_uri,
        content_type="application/json",
    )
)

model_package_group_name = "TestModelPackageGroup"
model_package = model.register(
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=[inference_instance_type],
    transform_instances=[inference_instance_type],
    model_metrics=model_metrics,
    model_package_group_name=model_package_group_name,
    approval_status="Approved",
)

model_package_arn = model_package.model_package_arn
print("Model Package ARN : ", model_package_arn)

Location of model: s3://sagemaker-ap-southeast-1-418542404631/petfinder6000/training/bpr-20230619-07-00-09/model.tar.gz
Model Package ARN :  arn:aws:sagemaker:ap-southeast-1:418542404631:model-package/testmodelpackagegroup/19


## Cleanup

### Delete experiments

In [45]:
from sagemaker.experiments.experiment import Experiment

experiment_name = "tensorflow-script-mode-experiment"
exp = Experiment.load(experiment_name=experiment_name, sagemaker_session=sagemaker_session)
print(exp)
exp._delete_all(action="--force")

Experiment(sagemaker_session=<sagemaker.session.Session object at 0x0000026ECD0BC670>,experiment_name='tensorflow-script-mode-experiment',experiment_arn='arn:aws:sagemaker:ap-southeast-1:418542404631:experiment/tensorflow-script-mode-experiment',display_name='tensorflow-script-mode-experiment',creation_time=datetime.datetime(2023, 6, 9, 22, 44, 58, 124000, tzinfo=tzlocal()),created_by={'UserProfileArn': 'arn:aws:sagemaker:ap-southeast-1:418542404631:user-profile/d-ni9jmwq8akiv/aeroxye', 'UserProfileName': 'aeroxye', 'DomainId': 'd-ni9jmwq8akiv'},last_modified_time=datetime.datetime(2023, 6, 10, 16, 30, 12, 866000, tzinfo=tzlocal()),last_modified_by={},response_metadata={'RequestId': 'd6609acb-9f9e-4700-97f8-235f2b7ffa68', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'd6609acb-9f9e-4700-97f8-235f2b7ffa68', 'content-type': 'application/x-amz-json-1.1', 'content-length': '472', 'date': 'Wed, 14 Jun 2023 08:22:31 GMT'}, 'RetryAttempts': 0})
