# ML Flow setup

In [2]:
# Move to the repository root (the folder that contains `src/`) only when needed.
# A bare `%cd ..` climbs one directory higher on every re-run, which breaks the
# relative paths (`src/...`) used by the cells below.
# NOTE(review): assumes the notebook's own folder has no `src/` sub-folder - confirm.
import os

if not os.path.isdir("src"):
    os.chdir("..")
print(os.getcwd())

/root/rl-market-simulator


In [2]:
!pip install -r src/requirements.txt

You should consider upgrading via the '/usr/local/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [3]:
# Community libraries
import os
import re
import subprocess
import sys
import glob
import shutil
import logging
import argparse
import numpy as np
import pandas as pd
import mlflow
import matplotlib.pyplot as plt
from IPython.display import HTML

# AWS libraries
import boto3
import sagemaker
from sagemaker.analytics import TrainingJobAnalytics
from sagemaker.rl import RLEstimator, RLToolkit, RLFramework

# From custom code
from src.utils import global_parameters as gp

# RLEstimator dependencies
#sys.path.append("common")

# Logging definition: configure the root logger at INFO level so the
# logging.info(...) calls in the cells below show up in the notebook output.
logging.basicConfig(level=logging.INFO)

# Function to download bucket contents for a specific directory
def download_s3_folder(bucket_name, s3_folder, local_dir=None):
    """
    Download every object stored under an S3 key prefix to the local file system.

    Args:
        bucket_name: the name of the s3 bucket
        s3_folder: the folder path (key prefix) in the s3 bucket
        local_dir: a relative or absolute directory path in the local file system.
            When None, each object is written to a local path equal to its S3 key.

    Side effects:
        Creates missing local directories, prints every downloaded target path
        and writes the downloaded files to disk.
    """
    # assumes credentials & configuration are handled outside python in .aws directory or environment variables
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)
    for obj in bucket.objects.filter(Prefix=s3_folder):
        target = obj.key if local_dir is None \
            else os.path.join(local_dir, os.path.relpath(obj.key, s3_folder))
        # A target without a parent folder has an empty dirname and os.makedirs('')
        # raises, so only create the parent when there is one. exist_ok replaces the
        # previous exists()/makedirs() pair and its check-then-create race.
        parent_dir = os.path.dirname(target)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # Keys ending in "/" are treated as folder placeholders and are not downloaded.
        if obj.key.endswith('/'):
            continue
        print(target)
        bucket.download_file(obj.key, target)

### Setup S3 bucket

Set up the connection and authentication to the S3 bucket that will store the checkpoints and the metadata.

In [4]:
# Resolve the SageMaker session, its default bucket and the notebook's execution role
sage_session = sagemaker.session.Session()
s3_bucket = sage_session.default_bucket()
s3_output_path = f"s3://{s3_bucket}/"
print(f"S3 bucket path: {s3_output_path}")
role = sagemaker.get_execution_role()

S3 bucket path: s3://sagemaker-eu-central-1-961105418118/


# Tracking RL agent experiments

In [6]:
# DNS name of the load balancer that fronts the Fargate cluster where the MLflow server is installed
tracking_uri = 'http://mll-mlflow-development-1-72adb7f6eb3c1c02.elb.eu-central-1.amazonaws.com'
rl_experiment_name = 'tfm-market-simulator-v3-sagemaker'

# Prefix used to build descriptive training job names
job_name_prefix = "rl-market-simulator"

# Hyperparameters passed to the RL estimator
hyperparameters = dict(
    tracking_uri=tracking_uri,
    experiment_name=rl_experiment_name,
    training_iteration=gp.MAX_ITERATIONS,
    gamma=0.50,
    horizon=gp.AGENT_MAX_STEPS,
    lr=0.001,
)

# Ray image
ray_tf_image = "462105765813.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-rl-ray-container:ray-1.6.0-tf-cpu-py37"

# RL estimator metrics: every regex is "<metric name>: " followed by the same
# capture group for a (possibly signed / scientific-notation) number.
_metric_value_regex = '([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'
metric_definitions = [
    {'Name': metric_name, 'Regex': f'{metric_name}: {_metric_value_regex}'}
    for metric_name in ('episode_reward_mean',
                        'episode_reward_max',
                        'episode_reward_min',
                        'training_iteration',
                        'entropy')
]

# MLflow tags attached to every run started from this notebook
mlflow_tags = dict(
    businessunit="machinelearninglab",
    subunit="machinelearninglab",
    application="mll-tfm-market-simulator",
    account="mll-dev",
    env="dev",
    service="mll-tfm-market-simulator-model-repository",
    version="3.0.0",
    contact="General - Machine Learning Lab <a44e37f5.TUIGroup.onmicrosoft.com@emea.teams.ms>",
    classification="internal",
)

# Training without tuning

In [None]:
# Definition of the estimator
estimator = RLEstimator(base_job_name=job_name_prefix,
                        entry_point='./models/train-rl-market-simulator.py',
                        source_dir= 'src',
                        #image_name=ray_tf_image,
                        dependencies=["src/common/sagemaker_rl"],
                        toolkit=RLToolkit.RAY,
                        framework=RLFramework.TENSORFLOW,
                        toolkit_version="1.6.0",
                        role=role,
                        debugger_hook_config=False,
                        instance_type='ml.m5.large',
                        instance_count=1,
                        output_path=s3_output_path,
                        use_spot_instances=True, # use spot instance
                        max_run = 3600, # seconds
                        max_wait = 3600, # seconds
                        hyperparameters = hyperparameters,
                        metric_definitions=metric_definitions,
                        )

# set remote mlflow server
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(rl_experiment_name)

# start mlflow run; the context manager ends the run when the block exits,
# so no explicit mlflow.end_run() is needed
with mlflow.start_run():

    # Set the run tags
    mlflow.set_tags(mlflow_tags)

    # MLflow algorithm hyperparameters logging
    logging.info('Logging input hyperparameters')
    mlflow.log_params(hyperparameters)

    # fit the RL model (blocks until the training job finishes)
    logging.info('Train the reinforcement learning agent.')
    estimator.fit(wait=True)

    # get the job name
    job_name = estimator.latest_training_job.job_name
    print("Training job: %s" % job_name)

    # log the job name associated to this run
    logging.info('Logging training job name')
    mlflow.log_params({"training_job_name": job_name})

    # Log metrics: one column per metric in progress_df, and one MLflow point per
    # recorded value. `step` is set explicitly so the values form a series instead
    # of all being written at the default step.
    progress_df = pd.DataFrame()
    for metric in ['episode_reward_mean','episode_reward_max','episode_reward_min']:
        df = TrainingJobAnalytics(job_name, [metric]).dataframe()
        df = df.rename(columns={"value":metric})
        progress_df = pd.concat([progress_df, df[metric]], axis=1)
        for step, value in enumerate(df[metric]):
            mlflow.log_metric(metric, value, step=step)

    # Log progress csv
    progress_df.to_csv('progress.csv')
    mlflow.log_artifact("progress.csv")

    # Log model artifacts: copy the job's S3 output locally, then attach it to the run
    logging.info(f'Get training artifacts from {estimator.model_data}')
    download_s3_folder(s3_bucket,job_name,'models')
    mlflow.log_artifact("models")

INFO:root:Logging input hyperparameters
INFO:root:Train the reinforcement learning agent.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py37
INFO:sagemaker:Creating training-job with name: rl-market-simulator-2022-10-03-14-30-21-288


2022-10-03 14:30:22 Starting - Starting the training job...
2022-10-03 14:30:38 Starting - Preparing the instances for trainingProfilerReport-1664807421: InProgress
......
2022-10-03 14:31:44 Downloading - Downloading input data......
2022-10-03 14:32:44 Training - Downloading the training image...
2022-10-03 14:33:16 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-10-03 14:33:21.135504: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2022-10-03 14:33:21.142657: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[34m2022-10-03 14:33:21.303324: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMa

# Training with hyperparameter tuning

In [None]:
from sagemaker.tuner import ContinuousParameter, HyperparameterTuner

# Static hyperparameters for the tuning section.
# NOTE(review): this rebinds `hyperparameters`, but no visible cell passes the new
# dict to an estimator - the tuner reuses the `estimator` built earlier. Confirm.
hyperparameters = dict(
    tracking_uri=tracking_uri,
    experiment_name=rl_experiment_name,
)

# Continuous search ranges explored by the tuner
rl_hyperparameter_ranges = dict(
    gamma=ContinuousParameter(min_value=0.01, max_value=0.99),
    lr=ContinuousParameter(min_value=0.0001, max_value=0.001),
)

In [None]:
# check source https://sagemaker-examples.readthedocs.io/en/latest/reinforcement_learning/rl_roboschool_ray/rl_roboschool_ray_automatic_model_tuning.html
# for further clarification
tuner = HyperparameterTuner(estimator,
                            objective_metric_name='episode_reward_mean',
                            objective_type='Maximize',
                            hyperparameter_ranges=rl_hyperparameter_ranges,
                            metric_definitions=metric_definitions,
                            max_jobs=5,
                            max_parallel_jobs=5,
                            base_tuning_job_name='tfm-market-simulator-tuning')

# set remote mlflow server
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(rl_experiment_name)

# the context manager ends the run when the block exits,
# so no explicit mlflow.end_run() is needed
with mlflow.start_run():

    # Set the run tags
    mlflow.set_tags(mlflow_tags)

    # MLflow algorithm hyperparameters logging
    #logging.info('Logging tuning hyperparameters')
    #mlflow.log_params(rl_hyperparameter_ranges)

    # fit the RL model with tuning (blocks until the tuning job finishes)
    logging.info('Train the reinforcement learning agent.')
    tuner.fit(wait=True)

    # Log the description of the best training job as run parameters
    logging.info('Logging completed run parameters')
    mlflow.log_params(tuner.describe()['BestTrainingJob'])

    # get the BEST training run
    job_name = tuner.best_training_job()
    print("Training job: %s" % job_name)

    # Log metrics: one column per metric in progress_df, and one MLflow point per
    # recorded value. `step` is set explicitly so the values form a series instead
    # of all being written at the default step.
    progress_df = pd.DataFrame()
    for metric in ['episode_reward_mean','episode_reward_max','episode_reward_min']:
        df = TrainingJobAnalytics(job_name, [metric]).dataframe()
        df = df.rename(columns={"value":metric})
        progress_df = pd.concat([progress_df, df[metric]], axis=1)
        for step, value in enumerate(df[metric]):
            mlflow.log_metric(metric, value, step=step)

    # Log progress csv
    progress_df.to_csv('progress.csv')
    mlflow.log_artifact("progress.csv")

    # Log model artifacts: copy the job's S3 output locally, then attach it to the run
    logging.info(f'Get training artifacts from training job={job_name}')
    download_s3_folder(s3_bucket,job_name,'models')
    mlflow.log_artifact("models")

## Model deployment

Now let us deploy the RL policy so that we can get the optimal action, given an environment observation.

**Note**: Model deployment is currently supported for TensorFlow only.

STOP HERE IF PYTORCH IS USED.

In [None]:
# Model artifact location (`model_data`) of the estimator attached to the
# tuner's best training job; displayed as the cell output.
best_model = tuner.best_estimator().model_data
best_model


2022-09-30 09:52:34 Starting - Preparing the instances for training
2022-09-30 09:52:34 Downloading - Downloading input data
2022-09-30 09:52:34 Training - Training image download completed. Training in progress.
2022-09-30 09:52:34 Uploading - Uploading generated training model
2022-09-30 09:52:34 Completed - Training job completed


's3://sagemaker-eu-central-1-961105418118/tfm-market-simulator-220930-0944-001-0c6f4dac/output/model.tar.gz'

In [191]:
# Example S3 URI of a trained model archive (model.tar.gz) used for the deployment test
model_to_deploy_example = "s3://sagemaker-eu-central-1-961105418118/1rznlhvopllp-tfm-v3--d5nkL8QAcd-003-122dbef7/output/model.tar.gz"

In [33]:
# download model: copy every object under the training job's output/ prefix into ./tmp
download_s3_folder(s3_bucket, 'tfm-market-simulator-220930-0944-001-0c6f4dac/output', local_dir='tmp')

tmp/intermediate/training/.tmp_checkpoint
tmp/intermediate/training/.tmp_generator
tmp/intermediate/training/PPO_myEnv_24abc_00000_0_2022-09-30_09-49-14/checkpoint_000003/.is_checkpoint
tmp/intermediate/training/PPO_myEnv_24abc_00000_0_2022-09-30_09-49-14/checkpoint_000003/checkpoint-3
tmp/intermediate/training/PPO_myEnv_24abc_00000_0_2022-09-30_09-49-14/checkpoint_000003/checkpoint-3.tune_metadata
tmp/intermediate/training/PPO_myEnv_24abc_00000_0_2022-09-30_09-49-14/events.out.tfevents.1664531355.algo-1
tmp/intermediate/training/PPO_myEnv_24abc_00000_0_2022-09-30_09-49-14/params.json
tmp/intermediate/training/PPO_myEnv_24abc_00000_0_2022-09-30_09-49-14/params.pkl
tmp/intermediate/training/PPO_myEnv_24abc_00000_0_2022-09-30_09-49-14/progress.csv
tmp/intermediate/training/PPO_myEnv_24abc_00000_0_2022-09-30_09-49-14/result.json
tmp/model.tar.gz


In [41]:
# Make sure the local scratch directory exists. `target` is defined here so the cell
# does not rely on a variable assigned by a later cell, and exist_ok=True keeps the
# call from raising FileExistsError when the directory is already there.
target = 'tmp'
os.makedirs(target, exist_ok=True)

In [62]:
# download model tar.gz
s3 = boto3.resource('s3')
bucket = s3.Bucket(s3_bucket)
target = 'tmp'
# Create the target directory when missing. The previous check tested
# os.path.dirname('tmp'), which is '' and never exists, so makedirs always ran
# and raised FileExistsError whenever ./tmp was already present.
os.makedirs(target, exist_ok=True)
bucket.download_file("tfm-market-simulator-220930-0944-001-0c6f4dac/output/model.tar.gz", "tmp/model.tar.gz")

In [192]:
from sagemaker.tensorflow.model import TensorFlowModel

# Wrap the example model archive in a SageMaker TensorFlow model and deploy it
# to a single-instance endpoint.
model = TensorFlowModel(
    model_data=model_to_deploy_example,
    framework_version="2.5.1",
    role=role,
)
predictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Creating model with name: tensorflow-inference-2022-10-14-16-18-06-304
INFO:sagemaker:Creating endpoint with name tensorflow-inference-2022-10-14-16-18-06-942


----!

In [199]:
# ray 1.6.0 requires all the following inputs, ray 0.8.5 or below remove 'timestep'
# 'prev_action', 'is_training', 'prev_reward', 'seq_lens' and 'timestep' are placeholders for this example
# they won't affect prediction results

# Number of different values stored at any time in the current state for the TFM market simulator.
TFM_BOOKING_CURVE_NUMBER_FEATURES = 21

# Sample predictions with random inputs
number_of_examples = 10
testing_data = np.random.rand(number_of_examples, TFM_BOOKING_CURVE_NUMBER_FEATURES).tolist()

# Named `request_payload` rather than `input` so the builtin input() is not shadowed
request_payload = {
    "inputs": {
        "observations": testing_data,
        "prev_action": [1],
        "is_training": False,
        "prev_reward": -1,
        "seq_lens": -1,
        "timestep": 1,
    }
}

result = predictor.predict(request_payload)

# Actions returned for the submitted observations
result["outputs"]["actions_0"]

[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]

In [11]:
# Scratch check: one random observation row with 21 values, as a nested Python list
np.random.rand(1, 21).tolist()

[[0.576083629471641,
  0.1900841856662263,
  0.028175621159361564,
  0.35442517173266674,
  0.9054375392123268,
  0.8158723542405849,
  0.15997761845254854,
  0.6055418081810722,
  0.17907020240222793,
  0.7489194207620015,
  0.8850696660372761,
  0.23267985741174269,
  0.9441477586953245,
  0.9597605804961318,
  0.7941387314539569,
  0.9369093157914256,
  0.8481229877121451,
  0.0545036211139488,
  0.7318125273327869,
  0.8735593658281017,
  0.49502695985182277]]

### Delete endpoint

In [200]:
# Delete the endpoint (and its endpoint configuration) behind `predictor`
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: tensorflow-inference-2022-10-14-16-18-06-942
INFO:sagemaker:Deleting endpoint with name: tensorflow-inference-2022-10-14-16-18-06-942


# Deploy endpoint as mlflow

In [50]:
# Project QA helper, configured with one RL training job name and the bucket
# passed as `rl_artifacts_bucket` (presumably where that job's artifacts live - confirm).
from src.utils.quality_tests import QAforTFM
tfm_qa = QAforTFM(rl_training_job = 'fejh754xz0ob-tfm-v3--ISGzUj7dZ3-010-3d789f30', 
                  rl_artifacts_bucket = s3_bucket)

In [51]:
# extract rl model tar.gz
# `model_tar_path` is the local folder handed to the QA helper; it is reused by
# the markup evaluation cell.
model_tar_path = 'tmp'
tfm_qa.extract_rl_model(model_tar_path)

In [52]:
# evaluate markups
# Read the processed test set straight from S3 and ask the QA helper for the
# unique markup values (the cell output is an array of values in 0.5 steps).
prediction_data = pd.read_csv('s3://mll-mlflow-development-961105418118/15/3616a7af66be4a1782290e2e6606fc4b/artifacts/processed_x_test_data.csv')
tfm_qa.markups_unique_values(model_tar_path, prediction_data)

array([ 0. ,  0.5,  1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ,  4.5,  5. ,
        5.5,  6. ,  6.5,  7. ,  7.5,  8. ,  8.5,  9. ,  9.5, 10. , 10.5,
       11. , 11.5, 12. , 12.5, 13. , 13.5, 14. , 14.5, 15. , 15.5, 16. ,
       16.5, 17. , 17.5, 18. , 18.5, 19. , 19.5, 20. , 20.5, 21. , 21.5,
       22. , 22.5, 23. , 23.5, 24. , 24.5, 25. , 25.5, 26. , 26.5, 27. ,
       27.5, 28. , 28.5, 29. , 29.5, 30. , 30.5, 31. , 31.5, 32. , 32.5,
       33. , 33.5, 34. , 34.5, 35. , 35.5, 36. , 36.5, 37. , 37.5, 38. ,
       38.5, 39. , 39.5, 40. , 40.5, 41. , 41.5, 42. , 42.5, 43. , 43.5,
       44. , 44.5, 45. , 45.5, 46. , 46.5, 47. , 47.5, 48. , 48.5, 49. ,
       49.5, 50. , 50.5, 51. , 51.5, 52. , 52.5, 53. , 53.5, 54. , 54.5,
       55. , 55.5, 56. , 56.5, 57. , 57.5, 58. , 58.5, 59. , 59.5])

In [6]:
# download model tar.gz
s3 = boto3.resource('s3')
bucket = s3.Bucket(s3_bucket)
target = 'tmp'
selected_training_rl_job = 'efkphzjch6z1-tfm-v3--UQkqq2HymU-005-97ff9096' # select the rl training run

# Create the target directory when missing. The previous check tested
# os.path.dirname('tmp'), which is '' and never exists, so makedirs always ran
# and raised FileExistsError whenever ./tmp was already present.
os.makedirs(target, exist_ok=True)
bucket.download_file(f"{selected_training_rl_job}/output/model.tar.gz", f"{target}/model_new.tar.gz")

In [7]:
# importing the "tarfile" module
import tarfile

# Open the downloaded archive and extract it into <target>/model.
# The `with` block closes the archive even if extraction raises.
# NOTE(review): extractall() trusts the member paths inside the archive; only use
# it on archives produced by our own training jobs.
with tarfile.open(f"{target}/model_new.tar.gz") as archive:
    archive.extractall(f"{target}/model")

In [8]:
import tensorflow as tf

# Load the extracted model with TensorFlow from the "1" sub-folder of <target>/model
filepath = f"{target}/model"
rl_model = tf.saved_model.load(f'{filepath}/1')

In [9]:
# List the signature names exposed by the loaded model
print(list(rl_model.signatures.keys()))

['serving_default']


In [10]:
# Pick the default serving signature and show the structure of its outputs
infer = rl_model.signatures['serving_default']
print(infer.structured_outputs)

{'action_prob': <tf.Tensor 'default_policy/Exp:0' shape=(None,) dtype=float32>, 'vf_preds': <tf.Tensor 'default_policy/Reshape:0' shape=(None,) dtype=float32>, 'actions_0': <tf.Tensor 'default_policy/cond_1/Merge:0' shape=(None,) dtype=int64>, 'action_dist_inputs': <tf.Tensor 'default_policy/model/fc_out/BiasAdd:0' shape=(None, 120) dtype=float32>, 'action_logp': <tf.Tensor 'default_policy/cond_2/Merge:0' shape=(None,) dtype=float32>}


In [11]:
from tensorflow.python.saved_model import signature_constants

In [12]:
# TensorFlow version available in this kernel
print(tf.__version__)

2.6.2


In [13]:
# Meta graph tag list built from TensorFlow's SERVING constant (displays ['serve'])
tag=[tf.saved_model.SERVING]
tag

['serve']

In [14]:
# Default serving signature key as a plain string (displays 'serving_default')
key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
key

'serving_default'

In [5]:
# set remote mlflow server
mlflow.set_tracking_uri("http://mll-mlflow-development-1-72adb7f6eb3c1c02.elb.eu-central-1.amazonaws.com/")
mlflow.set_experiment("tfm-tensorflow-test")

# Meta graph tags and signature key of the SavedModel to log.
# `tf_signature_def_key` must be a single string (a key of the SavedModel's
# signature map), not a list.
tag = ['serve']
key = 'serving_default'

# test tensorflow logging; the context manager ends the run when the block exits,
# so no explicit mlflow.end_run() is needed
with mlflow.start_run():

    # NOTE(review): 'tmp' has to be a TensorFlow SavedModel directory; an earlier cell
    # loads the SavedModel from f'{filepath}/1' - confirm which path is intended.
    mlflow.tensorflow.log_model(#tf_saved_model_dir=f'{filepath}/1',
                         tf_saved_model_dir='tmp',
                         tf_signature_def_key = key,
                         tf_meta_graph_tags = tag,
                         artifact_path="model",
                         registered_model_name="test-tfm")

NameError: name 'mlflow' is not defined

In [223]:
# load tensorflow model
logged_model = 'runs:/6d5da4e4fd5141ed87b70a2f0e5b1dde/model'

# Load the model through the MLflow TensorFlow flavor. The object displayed below
# is a TensorFlow ConcreteFunction (see the cell output), not a PyFuncModel.
loaded_model = mlflow.tensorflow.load_model(logged_model)
loaded_model

<ConcreteFunction pruned(is_training, observations, prev_action, prev_reward, timestep) at 0x7F6697BA30A0>

In [227]:
# example with one prediction
# A single observation of ones is fed to the loaded function together with
# constant values for the remaining inputs.
TFM_BOOKING_CURVE_NUMBER_FEATURES = 21
prediction = loaded_model(is_training = tf.convert_to_tensor(False), 
                     observations = tf.convert_to_tensor(np.ones(shape=(1, TFM_BOOKING_CURVE_NUMBER_FEATURES)).tolist()), 
                     prev_action = tf.constant(0, dtype=tf.int64), 
                     prev_reward = tf.constant(100, dtype=tf.float32), 
                     timestep=tf.cast(1.0, tf.int64))
# The predicted action is scaled by 0.5 to obtain the markup
# (presumably each action index is one 0.5 markup step - confirm).
markup = np.multiply(prediction['actions_0'].numpy(), 0.5)
markup

array([16.])

In [234]:
# read the data used for the predictions
data = pd.read_csv('s3://mll-mlflow-development-961105418118/15/3616a7af66be4a1782290e2e6606fc4b/artifacts/processed_x_test_data.csv')
# the number of features is taken from the column count of the loaded data
TFM_BOOKING_CURVE_NUMBER_FEATURES = len(data.columns)
# the preview is the last expression of the cell so that it is actually rendered
data.head()

In [235]:
# make predictions
# Every row of `data` is passed as one observation; the remaining inputs are the
# same constant values used in the single-prediction example.
prediction = loaded_model(is_training = tf.convert_to_tensor(False), 
                     observations = tf.convert_to_tensor(data.to_numpy().tolist()), 
                     prev_action = tf.constant(0, dtype=tf.int64), 
                     prev_reward = tf.constant(100, dtype=tf.float32), 
                     timestep=tf.cast(1.0, tf.int64))
# The predicted actions are scaled by 0.5 to obtain the markups
markup = np.multiply(prediction['actions_0'].numpy(), 0.5)
markup

array([14., 14., 14., ..., 14., 14., 14.])

In [236]:
# Distinct markup values predicted for the test data
np.unique(markup)

array([14., 16.])

In [238]:
# Number of predicted markups that are equal to the reference value
reference = 16
int(np.count_nonzero(markup == reference))

9412

# Get best run from ML Flow

In [7]:
# required libraries
import os
import boto3
import warnings
import sagemaker
import numpy as np
import pandas as pd
import json
import pathlib
from pandas.io.json import json_normalize
from scipy import stats

import logging
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
from mlflow.tracking.artifact_utils import _download_artifact_from_uri
from mlflow.tracking._tracking_service.client import TrackingServiceClient
from mlflow.store.entities.paged_list import PagedList
from mlflow.entities import Experiment, Run, RunInfo, Param, Metric, RunTag, FileInfo, ViewType

# Function to get mlflow runs
def update_run_df(run: "RunInfo") -> pd.DataFrame:
    """
    Convert one MLflow entity into a single-row DataFrame.

    Args:
        run: an iterable of (field_name, value) pairs. This notebook passes the
            items returned by `list_run_infos` - presumably RunInfo entities;
            confirm against the MLflow version in use.

    Returns:
        A DataFrame with one column per field name and a single row
        (index label 1) holding the corresponding values.
    """
    df = pd.DataFrame.from_records(list(run)).T
    # After the transpose, row 0 holds the field names and row 1 the values:
    # promote row 0 to the header, then drop it from the data.
    df.columns = df.iloc[0]
    df = df[1:]
    return df

In [59]:
from datetime import datetime, timedelta

# Despite its name, `today_date` holds yesterday's date (now minus one day),
# formatted as YYYY-MM-DD.
#today_date = datetime.today().strftime('%Y-%m-%d')
today_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
today_date

'2022-10-20'

In [68]:
# MLflow tracking server and experiment to query
tracking_uri = 'http://mll-mlflow-development-1-72adb7f6eb3c1c02.elb.eu-central-1.amazonaws.com//'
experiment_name = 'tfm-v3-rlestimator+mlflow+pipelines'
mlflow_client = TrackingServiceClient(tracking_uri)
# the experiment entity is converted to a dict to read its id
experiment_id=dict(mlflow_client.get_experiment_by_name(experiment_name))['experiment_id']

# get best model for todays runs based on the reward mean
# (the filter relies on `today_date`, which is defined in a separate cell)
runs = mlflow_client.search_runs(experiment_id, 
                                 filter_string=f"parameters.training_job_date = '{today_date}'", 
                                 order_by=["metrics.episode_reward_mean DESC"], max_results=1)
# NOTE(review): indexing fails when no run matches the filter
best_run = runs[0]

# get mlflow run id
best_run.info.run_id

'd045a85aa3db487daab84334d438526e'

In [48]:
# Same query without a filter: the single run with the highest episode_reward_mean
# in the experiment.
runs = mlflow_client.search_runs(experiment_id, "", order_by=["metrics.episode_reward_mean DESC"], max_results=1)
best_run = runs[0]
best_run

1666262274013

In [4]:
# required parameters
tracking_uri = 'http://mll-mlflow-development-1-72adb7f6eb3c1c02.elb.eu-central-1.amazonaws.com//'
experiment_name = 'tfm-v3-rlestimator+mlflow+pipelines'

# Get mlflow data
mlflow_client = TrackingServiceClient(tracking_uri)
experiment_id=dict(mlflow_client.get_experiment_by_name(experiment_name))['experiment_id']

# Get mlflow results: build every single-row frame first and concatenate once,
# instead of growing the DataFrame inside the loop (which re-copies it each time).
run_frames = [update_run_df(run_info)
              for run_info in mlflow_client.list_run_infos(experiment_id,run_view_type=ViewType.ACTIVE_ONLY)]
runs = pd.concat(run_frames, axis=0) if run_frames else pd.DataFrame()

# Remove unfinished runs
runs = runs[runs['status']=='FINISHED']

# get the most recent run that finished successfully
run_id = runs['run_id'][runs['end_time']==max(runs['end_time'])].values[0]
print('Run ID:', run_id)

# Build model uri
model_uri = f'runs:/{run_id}/model'

# Build evaluation folder
output_dir = "/opt/ml/processing/evaluation"
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

# Download the artifact to local storage.
local_path = mlflow_client.download_artifacts(run_id, "progress.csv", output_dir)
print("Artifacts downloaded in: {}".format(output_dir))

# Read progress data
df_progress = pd.read_csv(local_path)

# get sample split indexes: compare the third quarter of the training history
# against the last quarter with a two-sample t-test
first_split, second_split = int(df_progress.shape[0]*0.50), int(df_progress.shape[0]*0.75)
results = stats.ttest_ind(df_progress['episode_reward_mean'][first_split:second_split], df_progress['episode_reward_mean'][second_split:])
print(f"P-value={results.pvalue}")

if np.isnan(results.pvalue):
    # The test is undefined (e.g. too few rows in one of the windows); without this
    # branch a NaN p-value would fall through to the "converged" message.
    print("P-value is NaN: not enough progress data to assess convergence")
elif results.pvalue < 0.05:
    print(f"Reject null hypothesis since {results.pvalue}<0.05")
else:
    print("Training reward mean has converged")

Run ID: d414237f97ed4ae5aca24b363c034792
Artifacts downloaded in: /opt/ml/processing/evaluation
P-value=0.17288628613038395
Training reward mean has converged


In [5]:
# Show the runs whose status is FINISHED
runs[runs['status']=='FINISHED']

Unnamed: 0,artifact_uri,end_time,experiment_id,lifecycle_stage,run_id,run_uuid,start_time,status,user_id
1,s3://mll-mlflow-development-961105418118/13/d4...,1666219460738,13,active,d414237f97ed4ae5aca24b363c034792,d414237f97ed4ae5aca24b363c034792,1666217023176,FINISHED,root
1,s3://mll-mlflow-development-961105418118/13/f3...,1666219233783,13,active,f3f3b565ac784167a6d3b49c413b4ce7,f3f3b565ac784167a6d3b49c413b4ce7,1666217022167,FINISHED,root
1,s3://mll-mlflow-development-961105418118/13/67...,1666219356130,13,active,67975a7920a941e2a653ebd49d273f2f,67975a7920a941e2a653ebd49d273f2f,1666217015649,FINISHED,root
1,s3://mll-mlflow-development-961105418118/13/14...,1666219373563,13,active,149e8ccc16914abb98b37679474b2192,149e8ccc16914abb98b37679474b2192,1666217015071,FINISHED,root
1,s3://mll-mlflow-development-961105418118/13/d2...,1666219451087,13,active,d22bf2bbdcf841f2848e3e012dea2684,d22bf2bbdcf841f2848e3e012dea2684,1666217012407,FINISHED,root
...,...,...,...,...,...,...,...,...,...
1,s3://mll-mlflow-development-961105418118/13/60...,1664894323625,13,active,609f531744534003bf7bacb9247a7091,609f531744534003bf7bacb9247a7091,1664894123329,FINISHED,root
1,s3://mll-mlflow-development-961105418118/13/c6...,1664894315520,13,active,c642181f78764417b9cdb959044854ee,c642181f78764417b9cdb959044854ee,1664894118424,FINISHED,root
1,s3://mll-mlflow-development-961105418118/13/cd...,1664894296628,13,active,cd53de0440784875a9c536ce2fed32a1,cd53de0440784875a9c536ce2fed32a1,1664894113896,FINISHED,root
1,s3://mll-mlflow-development-961105418118/13/2a...,1664894263682,13,active,2a6493845ad44402b38e960f3ad79136,2a6493845ad44402b38e960f3ad79136,1664894112718,FINISHED,root


In [36]:
# Fetch the data of run `run_id`.
# NOTE(review): `omlflow` is not defined anywhere in the visible notebook, so this
# cell only works with state left over from another session - confirm its origin.
data = omlflow._tracking_client.get_run(run_id).data

In [38]:
# Read one metric from the fetched run data
data.metrics['training_recall_score']

0.9313119830869503

In [39]:
# Display the run id currently held in `run_id`
run_id

'042acd62196741ca9fda2ff28943fffb'

# Testing ENDPOINT deployment

In [3]:
import os
import sys
import yaml
import logging
import argparse

from src.model_deploy.utils import MLflowHandler


# CONFIG
# Parse the deployment settings with PyYAML's SafeLoader and display the result
with open("cfg/model_deploy.yaml") as f:
    config = yaml.load(f, Loader=yaml.SafeLoader)
config

{'model': {'name': 'tfm-v3-model-test',
  'version': 11,
  'tracking_uri': 'http://mll-mlflow-development-1-72adb7f6eb3c1c02.elb.eu-central-1.amazonaws.com/',
  'location_ssm_parameter': '/tfm/model/location',
  'location_ssm_parameter_tf': '/tfm3/model/location'},
 'endpoint': {'instance_type': 'ml.m5.xlarge',
  'instance_count': 1,
  'image_uri': '961105418118.dkr.ecr.eu-central-1.amazonaws.com/mlflow-pyfunc:1.27.0',
  'image_uri_tf': '763104351884.dkr.ecr.eu-central-1.amazonaws.com/tensorflow-inference:2.5.1-cpu-py37-ubuntu18.04'}}

In [4]:
# INSTANTIATE MLFLOW HANDLER
# The project's handler is built from the deployment config loaded from cfg/model_deploy.yaml
mlflow_handler = MLflowHandler(cfg=config)

In [5]:
# Prepare the model for SageMaker; per the cell output, model.tar.gz is downloaded
# locally and then uploaded to the SageMaker bucket.
mlflow_handler.prepare_sagemaker_model()

model.tar.gz dowloaded from /tmp/tmpx1shtpc7/model/model.tar.gz
model.tar.gz uploaded to s3://sagemaker-eu-central-1-961105418118/mlflow_model/tfm-v3-model-test-11/model.tar.gz


In [9]:
# Ask the handler to transition the model version to the "Staging" stage
mlflow_handler.transition_model_version_stage("Staging")

In [10]:
# Ask the handler to transition the model version to the "Production" stage
mlflow_handler.transition_model_version_stage("Production")

# Build Tensorflow container with custom input and output handler

In [2]:
# Move to the repository root (the folder that contains `src/`) only when needed.
# A bare `%cd ..` climbs one directory higher on every re-run, which breaks the
# relative paths (`src/...`) used by the cells below.
# NOTE(review): assumes the notebook's own folder has no `src/` sub-folder - confirm.
import os

if not os.path.isdir("src"):
    os.chdir("..")
print(os.getcwd())

/root/rl-market-simulator


In [5]:
# Test tensorflow model deployment
import sagemaker
from sagemaker.tensorflow import TensorFlowModel

# The model archive comes from the MLflow artifact bucket and is served by the
# TensorFlow inference image with the project's inference.py as entry point.
model = TensorFlowModel(model_data='s3://mll-mlflow-development-961105418118/13/cd258ca110b6459e8e9fd7c985237c5f/artifacts/model/model.tar.gz', 
                        entry_point='src/model_deploy/inference.py', 
                        image_uri='763104351884.dkr.ecr.eu-central-1.amazonaws.com/tensorflow-inference:2.5.1-cpu-py37-ubuntu18.04',
                        role=sagemaker.get_execution_role())

# Deploy to a single-instance endpoint with a fixed endpoint name
predictor = model.deploy(initial_instance_count=1, 
                         instance_type='ml.c5.xlarge', 
                         endpoint_name='tfm-v3-custom-endpoint-test-2')

update_endpoint is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


----!

In [11]:
# Making predictions
# Named `request_payload` rather than `input` so the builtin input() is not shadowed
request_payload = {
  'currency': 'euro',
  'vatPercentage': '23',
  'supplierPriceAdult': 100
}
predictor.predict(request_payload)

{'totalMarkup': {'adult': 59.0, 'child': 59.0, 'infant': 0.0},
 'vatPercentage': '23',
 'currency': 'euro',
 'markupDetails': [{'adult': 59.0,
   'child': 59.0,
   'infant': 0.0,
   'ruleId': 'TFM-MarginBrain-RL'}],
 'markupMethod': 'TFM-MarginBrain-RL',
 'supplierPrice': {'adult': 100.0, 'child': 100.0, 'infant': 0.0}}

# Build SKlearn container with custom input and output handler

In [1]:
# Move to the repository root (the folder that contains `src/`) only when needed.
# A bare `%cd ..` climbs one directory higher on every re-run, which breaks the
# relative paths (`src/...`) used by the cells below.
# NOTE(review): assumes the notebook's own folder has no `src/` sub-folder - confirm.
import os

if not os.path.isdir("src"):
    os.chdir("..")
print(os.getcwd())

/root/rl-market-simulator


In [None]:
# Test sklearn model deployment
import os
import sagemaker
import boto3
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.model import Model

# get sagemaker session backed by a boto3 session pinned to the eu-central-1 region
session = sagemaker.Session(boto3.Session(region_name='eu-central-1'))

In [3]:
# PREPROCESSOR MODEL
# scikit-learn model built from the preprocessing archive in S3; the inference
# code is `preprocessing_inference.py` inside `src/model_deploy`.
preprocessing_model = SKLearnModel(
    model_data='s3://sagemaker-eu-central-1-961105418118/Process-TFM-V3-data-90d887376bc6007c57cb35b905cc713e/output/feat_transform/preprocessing_model.tar.gz',
    role=sagemaker.get_execution_role(),
    sagemaker_session=session,
    source_dir='src/model_deploy',
    entry_point='preprocessing_inference.py',
    framework_version='0.23-1',
)

In [6]:
# Deploy the preprocessing model to a single-instance endpoint with a fixed name;
# the same endpoint name is used by the invocation cell.
predictor = preprocessing_model.deploy(initial_instance_count=1, 
                         instance_type='ml.c5.xlarge', 
                         endpoint_name='tfm-v3-preprocessing-endpoint-test-v23')

-----!

In [9]:
import json

import boto3

# Low-level runtime client used to call the deployed endpoint directly
client = boto3.client('sagemaker-runtime')
content_type = "application/json"
# Single-record request; each column maps the row label "0" to its value
request_body = {
    "SOURCE":{"0":"AMADEUS-MPIS"},
    "JOURNEY_TYPE":{"0":"ROUNDTRIP"},
    "DEPARTURE_CLUSTER":{"0":"ES"},
    "DESTINATION_CLUSTER":{"0":"City"},
    "ttd_cluster":{"0":"A"},
    "OUT_ELAPSED_FLIGHT_TIME":{"0":140},
    "NUM_PAX":{"0":3},
    "TOTAL_MARKUP_AMOUNT_PER_PAX":{"0":15},
    "DAYSTODEP":{"0":15},
}
endpoint_name = "tfm-v3-preprocessing-endpoint-test-v23"
print(request_body)

# request_body is already JSON-serialisable, so it is serialised once; the previous
# dumps -> loads -> dumps round trip produced the same payload string and also
# rebound the notebook-level name `data`.
payload = json.dumps(request_body)
response = client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType=content_type,
    Body=payload)
# Decode the JSON document returned by the endpoint and display it
result = json.loads(response['Body'].read().decode())
result

{'SOURCE': {'0': 'AMADEUS-MPIS'}, 'JOURNEY_TYPE': {'0': 'ROUNDTRIP'}, 'DEPARTURE_CLUSTER': {'0': 'ES'}, 'DESTINATION_CLUSTER': {'0': 'City'}, 'ttd_cluster': {'0': 'A'}, 'OUT_ELAPSED_FLIGHT_TIME': {'0': 140}, 'NUM_PAX': {'0': 3}, 'TOTAL_MARKUP_AMOUNT_PER_PAX': {'0': 15}, 'DAYSTODEP': {'0': 15}}


{'features': ['JOURNEY_TYPE',
  'DEPARTURE_CLUSTER',
  'DESTINATION_CLUSTER',
  'ttd_cluster',
  'AF',
  'AMADEUS-MPIS',
  'AMADEUS-MPTB',
  'EK',
  'FLX',
  'KL',
  'XQ',
  'LONG_HAUL',
  'SHORT_HAUL',
  'DAYSTODEP',
  'NUM_PAX',
  'TOTAL_MARKUP_AMOUNT_PER_PAX'],
 'tfm_v3_data': [['ROUNDTRIP',
   'ES',
   'City',
   'A',
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.04620462046204621,
   2.0,
   0.7692307692307692]]}

# Local deployment
This has to be done outside sagemaker studio!
Before we can get started we have the usual SageMaker imports to get our environment ready.

In [2]:
# Move to the repository root (the folder that contains `src/`) only when needed.
# A bare `%cd ..` climbs one directory higher on every re-run, which breaks the
# relative paths (`src/...`) used by the cells below.
# NOTE(review): assumes the notebook's own folder has no `src/` sub-folder - confirm.
import os

if not os.path.isdir("src"):
    os.chdir("..")
print(os.getcwd())

/root/rl-market-simulator


In [3]:
import boto3
import json
import os
import joblib
import pickle
import tarfile
import sagemaker
from sagemaker.estimator import Estimator
import time
from time import gmtime, strftime
import subprocess

# Setup: boto3 session, its S3 resource and region, and the SageMaker execution role
boto_session = boto3.session.Session()
s3 = boto_session.resource('s3')
region = boto_session.region_name
role = sagemaker.get_execution_role()

# SageMaker Local Mode Serving
First we create a SageMaker Local Session, this is essentially telling SageMaker we’re working in Local Mode.

In [4]:
import sagemaker
from sagemaker.local import LocalSession
from sagemaker.sklearn import SKLearn, SKLearnModel

# Create the SageMaker Local Mode session and print its type as a sanity check
session = LocalSession()
print(type(session))

<class 'sagemaker.local.local_session.LocalSession'>


Next we configure our SKLearn SageMaker estimator with Local Mode enabled.

In [5]:
# Enable `local_code` in the local session configuration
session.config = {'local': {'local_code': True}}

# NOTE(review): `session` is not passed to SKLearnModel (no `sagemaker_session=`
# argument here), so the configuration above may not be used by this model - confirm.
model = SKLearnModel(
    entry_point='src/model_deploy/preprocessing_inference.py',
    role=role,
    model_data='s3://sagemaker-eu-central-1-961105418118/Process-TFM-V3-data-66e680777166619f25191b8fcfb2ed7f/output/feat_transform/preprocessing_model.tar.gz',
    framework_version='0.23-1',
)

In [6]:
# Single-record request body, already serialised as a JSON string
json_payload = '{"SOURCE":{"0":"AMADEUS-MPIS"},"JOURNEY_TYPE":{"0":"ROUNDTRIP"},"OUT_ELAPSED_FLIGHT_TIME":{"0":140},"TOTAL_PRICE":{"0":220.57},"DEPARTURE_CLUSTER":{"0":"ES"},"DESTINATION_CLUSTER":{"0":"City"}}'

# Deploy in Local Mode and send the request. Failures are printed instead of raised
# so the notebook keeps running when Local Mode is unavailable.
try:
    predictor = model.deploy(initial_instance_count=1, instance_type='local')
    print(predictor)
    # Send the payload defined in this cell; the previous code referenced `payload`,
    # which this section never defines, and the NameError was hidden by the except.
    preds = predictor.predict(json_payload)
    print(preds)
except Exception as e:
    print(e)

'docker-compose' is not installed. Local Mode features will not work without docker-compose. For more information on how to install 'docker-compose', please, see https://docs.docker.com/compose/install/


In [8]:
# Same Local Mode deployment without the try/except, so the full error is shown
# (the recorded output is an ImportError about a missing docker-compose).
model.deploy(initial_instance_count=1, instance_type='local')

ImportError: 'docker-compose' is not installed. Local Mode features will not work without docker-compose. For more information on how to install 'docker-compose', please, see https://docs.docker.com/compose/install/