# [Lab2] SageMaker Training

In [1]:
# Restore variables persisted earlier with %store. Names that are used below
# but never assigned in this notebook (e.g. bucket, prefix, role, train_path,
# validation_path, mlflow_arn, experiment_name) are presumably loaded here.
%store -r

In [2]:
import sagemaker
import boto3
from time import gmtime, strftime

# Default boto3 session plus the SageMaker session; `sess` is handed to the
# estimators and used for SageMaker API calls further down.
boto_session = boto3.Session()
sess = sagemaker.Session()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


## 1. Preparing for Training Job (Built-in algorithm)

### Training Input

In [3]:
# Look up the XGBoost 1.7-1 image URI for the region of the default boto3 session.
current_region = boto3.Session().region_name
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=current_region,
    version='1.7-1',
)

In [4]:
# Fill the stored path templates with this account's bucket and prefix.
train_s3_uri = train_path.format(bucket, prefix)
validation_s3_uri = validation_path.format(bucket, prefix)

# One CSV input per split.
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data=train_s3_uri,
    content_type='csv',
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data=validation_s3_uri,
    content_type='csv',
)

# Keys are the channel names passed to fit().
training_inputs = dict(train=s3_input_train, validation=s3_input_validation)

## 2. Training Job Definition - Built-in Algorithm

In [5]:
def _new_xgb_estimator(**hyperparameters):
    """Create an estimator for the built-in XGBoost image and set its hyperparameters.

    Both experiments share the same image, role, instance settings, output
    location and job-name prefix; only the hyperparameters differ.
    """
    estimator = sagemaker.estimator.Estimator(
        image_uri=container,
        role=role,
        instance_count=1,
        instance_type='ml.m5.large',
        output_path='s3://{}/{}/output/'.format(bucket, prefix),
        base_job_name='training-job',
        sagemaker_session=sess,
    )
    estimator.set_hyperparameters(**hyperparameters)
    return estimator


# First hyperparameter set.
xgb1 = _new_xgb_estimator(
    max_depth=3,
    eta=0.5,
    gamma=4,
    eval_metric="auc",
    min_child_weight=6,
    subsample=0.8,
    verbosity=0,
    objective='binary:logistic',
    num_round=50,
)

# Second hyperparameter set.
xgb2 = _new_xgb_estimator(
    max_depth=3,
    eta=0.1,
    gamma=2,
    eval_metric="auc",
    min_child_weight=3,
    subsample=0.4,
    verbosity=0,
    objective='binary:logistic',
    num_round=100,
)

## 3. Logging

### MLflow Setup

In [6]:
import mlflow

# Point the MLflow client at the tracking server given by `mlflow_arn` and
# select the experiment that the runs logged below are recorded under.
mlflow.set_tracking_uri(mlflow_arn)
mlflow.set_experiment(experiment_name=experiment_name)

<Experiment: artifact_location='s3://sagemaker-ap-northeast-2-185567426878/mlflow/21-06-07-33/2', creation_time=1729494869387, experiment_id='2', last_update_time=1729494869387, lifecycle_stage='active', name='end-to-end-experiment-21-07-13-50', tags={}>

### Training Job

In [7]:
# Start the first job without blocking, so the next cell can launch the second one right away.
xgb1.fit(
    training_inputs,
    wait=False,
    logs=False,
)

INFO:sagemaker:Creating training-job with name: training-job-2024-10-21-13-30-18-998


In [8]:
# Start the second job and block until it finishes; log streaming is turned off.
xgb2.fit(
    training_inputs,
    wait=True,
    logs=False,
)

INFO:sagemaker:Creating training-job with name: training-job-2024-10-21-13-30-42-668


## Training in progress. Please stand by.

### Storing Parameters / Logs / Models

In [9]:
def load_model(model_data_s3_uri):
    """Download a model archive from S3 and load it as an XGBoost Booster.

    The archive is downloaded and unpacked inside a temporary directory, so
    nothing is left behind in the working directory and successive calls
    cannot overwrite each other's files.

    Args:
        model_data_s3_uri: ``s3://<bucket>/<key>`` URI of a gzipped tar archive
            that contains a file named ``xgboost-model``.

    Returns:
        The ``xgboost.Booster`` loaded from the extracted ``xgboost-model`` file.
    """
    import os
    import tarfile
    import tempfile

    import xgboost as xgb

    # Split "s3://bucket/some/key" into its bucket and key parts.
    s3_bucket, s3_key = model_data_s3_uri.replace("s3://", "").split("/", 1)

    with tempfile.TemporaryDirectory() as workdir:
        archive_path = os.path.join(workdir, "xgboost-model.tar.gz")
        boto3.client("s3").download_file(s3_bucket, s3_key, archive_path)

        # NOTE(review): extractall() trusts the member paths in the archive;
        # only use this on archives produced by your own training jobs.
        with tarfile.open(archive_path, "r:gz") as archive:
            archive.extractall(path=workdir)

        # Load while the temporary directory still exists; the Booster keeps
        # the model in memory afterwards.
        model = xgb.Booster()
        model.load_model(os.path.join(workdir, "xgboost-model"))

    return model
    
def run_logging(xgb_estimator, run_name):
    """Record a SageMaker training job as an MLflow run.

    Waits for the estimator's latest training job to finish, then logs its
    hyperparameters, job name, model location, final ``train:auc`` /
    ``validation:auc`` values and the trained model to a new MLflow run.

    Args:
        xgb_estimator: Estimator whose ``fit`` has already been called.
        run_name: Name given to the MLflow run.
    """
    training_job = xgb_estimator.latest_training_job

    # The job may have been started with fit(wait=False). Block until it has
    # finished so that model_data and the metrics are available.
    training_job.wait(logs="None")
    job_name = training_job.name

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(xgb_estimator.hyperparameters())
        mlflow.set_tags(
            {
                'mlflow.user': user_profile_name,
                'mlflow.source.name': f'https://{region}.console.aws.amazon.com/sagemaker/home?region={region}#/jobs/{job_name}',
                'mlflow.source.type': 'JOB'
            }
        )
        mlflow.log_param("training job name", job_name)
        mlflow.log_param("model_origin", xgb_estimator.model_data)

        # Always fetch fresh metrics rather than a cached frame.
        metrics = xgb_estimator.training_job_analytics.dataframe(force_refresh=True)

        # Later rows overwrite earlier ones, so the last reported value of each
        # metric is what gets logged. ':' is replaced by '_' in the logged name.
        logged_metrics = {}
        for _, row in metrics.iterrows():
            if row['metric_name'] in ('train:auc', 'validation:auc'):
                logged_metrics[row['metric_name'].replace(':', '_')] = row['value']
        mlflow.log_metrics(logged_metrics)

        model = load_model(xgb_estimator.model_data)
        mlflow.xgboost.log_model(model, artifact_path="model")

In [10]:
# MLflow run names have the form "<test name>-training-<day-hour-minute-second>" (UTC).
_RUN_TIMESTAMP_FORMAT = '%d-%H-%M-%S'

test_name_1 = "hyperparam1"
run_name_1 = "{}-training-{}".format(test_name_1, strftime(_RUN_TIMESTAMP_FORMAT, gmtime()))

test_name_2 = "hyperparam2"
run_name_2 = "{}-training-{}".format(test_name_2, strftime(_RUN_TIMESTAMP_FORMAT, gmtime()))

In [11]:
# Log each training job as its own MLflow run.
# NOTE(review): xgb1 was started with wait=False - confirm its job has finished
# before running this cell, since run_logging reads xgb1.model_data.
run_logging(xgb1, run_name_1)
run_logging(xgb2, run_name_2)



In [12]:
# Persist the run names so they can be restored later with `%store -r`.
%store run_name_1
%store run_name_2

Stored 'run_name_1' (str)
Stored 'run_name_2' (str)


## 4. Checking the training results

In [13]:
# Look the experiment up once so the run query and the UI link use the same id.
experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

# Most recently started run of the experiment.
last_run_id = mlflow.search_runs(
    experiment_ids=[experiment_id],
    max_results=1,
    order_by=["attributes.start_time DESC"]
)['run_id'][0]

presigned_url = sess.sagemaker_client.create_presigned_mlflow_tracking_server_url(
    TrackingServerName=mlflow_name,
    ExpiresInSeconds=60,
    SessionExpirationDurationInSeconds=1800
)['AuthorizedUrl']

# Build the link from the experiment's real id; a fixed id only works when the
# experiment happens to have been created with that id.
mlflow_run_link = f"{presigned_url.split('/auth')[0]}/#/experiments/{experiment_id}/runs/{last_run_id}"

In [15]:
from IPython.display import Javascript

# Ask the browser to open the MLflow run page in a new window.
open_run_js = 'window.open("{}");'.format(mlflow_run_link)
display(Javascript(open_run_js))

<IPython.core.display.Javascript object>

## 5. Register the models in MLflow

In [16]:
# Registered-model name derived from the experiment name; both model versions
# registered below go under this name.
registered_model_name = f"model-{experiment_name}"

In [17]:
# Find the first experiment's run by its run name rather than by its position
# among the latest runs, so re-running this cell after more runs have been
# logged still registers the model trained by xgb1.
run_id_1 = mlflow.search_runs(
    experiment_ids=[mlflow.get_experiment_by_name(experiment_name).experiment_id],
    filter_string=f"tags.mlflow.runName = '{run_name_1}'",
    max_results=1,
    order_by=["attributes.start_time DESC"]
)['run_id'].iloc[0]

model_uri = f"runs:/{run_id_1}/model"

registered_model_version_1 = mlflow.register_model(model_uri, registered_model_name)

Registered model 'model-end-to-end-experiment-21-07-13-50' already exists. Creating a new version of this model...
2024/10/21 13:40:25 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: model-end-to-end-experiment-21-07-13-50, version 3
Created version '3' of model 'model-end-to-end-experiment-21-07-13-50'.


In [18]:
# Find the second experiment's run by its run name rather than by its position
# among the latest runs, so re-running this cell after more runs have been
# logged still registers the model trained by xgb2.
run_id_2 = mlflow.search_runs(
    experiment_ids=[mlflow.get_experiment_by_name(experiment_name).experiment_id],
    filter_string=f"tags.mlflow.runName = '{run_name_2}'",
    max_results=1,
    order_by=["attributes.start_time DESC"]
)['run_id'].iloc[0]

model_uri = f"runs:/{run_id_2}/model"

registered_model_version_2 = mlflow.register_model(model_uri, registered_model_name)

Registered model 'model-end-to-end-experiment-21-07-13-50' already exists. Creating a new version of this model...
2024/10/21 13:40:27 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: model-end-to-end-experiment-21-07-13-50, version 4
Created version '4' of model 'model-end-to-end-experiment-21-07-13-50'.


In [19]:
# Persist the run ids and the registered model versions so they can be
# restored later with `%store -r`.
%store run_id_1
%store run_id_2
%store registered_model_version_1
%store registered_model_version_2

Stored 'run_id_1' (str)
Stored 'run_id_2' (str)
Stored 'registered_model_version_1' (ModelVersion)
Stored 'registered_model_version_2' (ModelVersion)


In [20]:
# End the active MLflow run, if there is one. The runs created by run_logging
# are opened in a `with` block, so this is presumably only a safeguard.
mlflow.end_run()

## [Optional] 6. Training Job - Script Mode 

### Dependencies

In [21]:
# Create the directory that holds the script-mode sources written below.
%mkdir -p './training'

In [22]:
%%writefile './training/requirements.txt'
mlflow==2.13.2
sagemaker-mlflow==0.1.0

Writing ./training/requirements.txt


### Training Script

In [23]:
%%writefile ./training/train.py

import argparse
import json
import logging
import os
import pandas as pd
import pickle as pkl

from sagemaker_containers import entry_point
from sagemaker_xgboost_container.data_utils import get_dmatrix
from sagemaker_xgboost_container import distributed

from sklearn.metrics import roc_auc_score

import xgboost as xgb
import mlflow

from time import gmtime, strftime

# Day-hour-minute-second (UTC) suffix used in generated experiment and run names.
suffix = strftime('%d-%H-%M-%S', gmtime())

# Runtime configuration read from environment variables.
user_profile_name = os.getenv('USER', 'sagemaker')
experiment_name = os.getenv('MLFLOW_EXPERIMENT_NAME')
region = os.getenv('REGION')

# MLflow is configured at import time. When MLFLOW_EXPERIMENT_NAME is unset or
# empty, a "train-<suffix>" experiment name is used instead.
mlflow.set_tracking_uri(os.getenv('MLFLOW_TRACKING_ARN'))
mlflow.set_experiment(experiment_name=experiment_name if experiment_name else f"train-{suffix}")

def _xgb_train(params, dtrain, dval, evals, num_boost_round, model_dir, is_master):
    """Train a booster, report its AUC, and save it when running on the master.

    This is the function handed to ``distributed.rabit_run`` for multi-host
    jobs; for single-host jobs it is called directly.

    :param params: Booster parameters passed to ``xgb.train``.
    :param dtrain: Training DMatrix.
    :param dval: Validation DMatrix, or None when there is no validation data.
    :param evals: Watchlist passed to ``xgb.train``.
    :param num_boost_round: Number of boosting rounds.
    :param model_dir: Directory the pickled booster is written to.
    :param is_master: True if current node is master host in distributed training,
                        or is running single node training job.
                        Note that rabit_run includes this argument.
    """
    booster = xgb.train(
        params=params,
        dtrain=dtrain,
        evals=evals,
        num_boost_round=num_boost_round
    )

    train_auc = roc_auc_score(dtrain.get_label(), booster.predict(dtrain))
    metrics = {"train_auc": train_auc}
    # emit training metrics - SageMaker collects them from the log stream
    metric_line = f"[0]#011train-auc:{train_auc}"

    # The caller builds a train-only watchlist when dval is None, so the
    # validation AUC is only computed when validation data exists.
    if dval is not None:
        val_auc = roc_auc_score(dval.get_label(), booster.predict(dval))
        metrics = {"validation_auc": val_auc, "train_auc": train_auc}
        metric_line = f"[0]#011train-auc:{train_auc}#011validation-auc:{val_auc}"

    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    print(metric_line)

    if is_master:
        model_location = os.path.join(model_dir, 'xgboost-model')
        # The context manager flushes and closes the file even if pickling fails.
        with open(model_location, 'wb') as model_file:
            pkl.dump(booster, model_file)
        print("Stored trained model at {}".format(model_location))


if __name__ == '__main__':
    # Script entry point: parse hyperparameters and SageMaker runtime settings,
    # load the train/validation channels, then train inside one MLflow run.
    parser = argparse.ArgumentParser()

    # Hyperparameters are described here.
    parser.add_argument('--max_depth', type=int)
    parser.add_argument('--eta', type=float)
    parser.add_argument('--alpha', type=float)
    # Parsed as float so fractional gamma values are accepted as well as integers.
    parser.add_argument('--gamma', type=float)
    parser.add_argument('--min_child_weight', type=float)
    parser.add_argument('--subsample', type=float)
    parser.add_argument('--colsample_bytree', type=float)
    parser.add_argument('--verbosity', type=int)
    parser.add_argument('--objective', type=str)
    parser.add_argument('--eval_metric', type=str)
    parser.add_argument('--num_round', type=int)
    parser.add_argument('--early_stopping_rounds', type=int)
    parser.add_argument('--tree_method', type=str, default="auto")
    parser.add_argument('--predictor', type=str, default="auto")

    # Sagemaker specific arguments. Defaults are set in the environment variables.
    parser.add_argument('--output_data_dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--validation', type=str, default=os.environ.get('SM_CHANNEL_VALIDATION'))
    parser.add_argument('--sm_hosts', type=str, default=os.environ.get('SM_HOSTS'))
    parser.add_argument('--sm_current_host', type=str, default=os.environ.get('SM_CURRENT_HOST'))
    parser.add_argument('--sm_training_env', type=str, default=os.environ.get('SM_TRAINING_ENV'))

    print("main function")
    # parse_known_args ignores arguments that are not declared above.
    args, _ = parser.parse_known_args()

    # Get SageMaker host information from runtime environment variables
    sm_hosts = json.loads(args.sm_hosts)
    sm_current_host = args.sm_current_host
    dtrain = get_dmatrix(args.train, 'CSV')
    dval = get_dmatrix(args.validation, 'CSV')

    watchlist = [(dtrain, 'train'), (dval, 'validation')] if dval is not None else [(dtrain, 'train')]

    # get SageMaker environment setup
    sm_training_env = json.loads(args.sm_training_env)

    # enable auto logging
    mlflow.xgboost.autolog(log_model_signatures=False, log_datasets=False)

    # Forward every parsed booster hyperparameter. Options that were not
    # supplied (None) are left out so XGBoost applies its own defaults.
    parsed_hp = {
        'max_depth': args.max_depth,
        'eta': args.eta,
        'alpha': args.alpha,
        'gamma': args.gamma,
        'min_child_weight': args.min_child_weight,
        'subsample': args.subsample,
        'colsample_bytree': args.colsample_bytree,
        'verbosity': args.verbosity,
        'objective': args.objective,
        'eval_metric': args.eval_metric,
        'tree_method': args.tree_method,
        'predictor': args.predictor,
    }
    train_hp = {name: value for name, value in parsed_hp.items() if value is not None}

    # NOTE(review): --early_stopping_rounds is parsed but not forwarded, because
    # _xgb_train has no parameter for it.
    xgb_train_args = dict(
        params=train_hp,
        dtrain=dtrain,
        dval=dval,
        evals=watchlist,
        num_boost_round=args.num_round,
        model_dir=args.model_dir)

    with mlflow.start_run(
        run_name=f"container-training-{suffix}",
        description="xgboost running in SageMaker container in script mode"
    ) as run:

        # Link the run to the training job's console page, or to the host name
        # when the current host is 'sagemaker-local'.
        mlflow.set_tags(
            {
                'mlflow.user':user_profile_name,
                'mlflow.source.type':'JOB',
                'mlflow.source.name': f"https://{region}.console.aws.amazon.com/sagemaker/home?region={region}#/jobs/{sm_training_env['job_name']}" if sm_training_env['current_host'] != 'sagemaker-local' else sm_training_env['current_host']
            }
        )

        if len(sm_hosts) > 1:
            # Wait until all hosts are able to find each other
            entry_point._wait_hostname_resolution()

            # Execute training function after initializing rabit.
            distributed.rabit_run(
                exec_fun=_xgb_train,
                args=xgb_train_args,
                include_in_training=(dtrain is not None),
                hosts=sm_hosts,
                current_host=sm_current_host,
                update_rabit_args=True
            )
        else:
            # If single node training, call training method directly.
            # Compare against None explicitly instead of relying on the
            # truthiness of the DMatrix object.
            if dtrain is not None:
                xgb_train_args['is_master'] = True
                _xgb_train(**xgb_train_args)
            else:
                raise ValueError("Training channel must have data to train model.")

# Return model object
def model_fn(model_dir):
    """Deserialize and return fitted model.

    Note that this should have the same name as the serialized model in the _xgb_train method

    :param model_dir: Directory containing the pickled ``xgboost-model`` file.
    :return: The object unpickled from that file.
    """
    model_file = 'xgboost-model'
    # NOTE(review): pickle.load executes code from the file; only load model
    # artifacts produced by your own training jobs.
    # The context manager closes the file handle once the model is loaded.
    with open(os.path.join(model_dir, model_file), 'rb') as model_handle:
        booster = pkl.load(model_handle)
    return booster

Overwriting ./training/train.py


In [25]:
# Hyperparameters handed to the script-mode training job.
hyperparams = dict(
    num_round=50,
    max_depth=3,
    eta=0.5,
    alpha=2.5,
    objective='binary:logistic',
    eval_metric='auc',
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    early_stopping_rounds=10,
    verbosity=1,
)

# Environment variables made available to the training script, which reads
# them with os.getenv.
env_variables = dict(
    MLFLOW_TRACKING_ARN=mlflow_arn,
    MLFLOW_EXPERIMENT_NAME=experiment_name,
    USER=user_profile_name,
    REGION=region,
)


In [26]:
from sagemaker.xgboost.estimator import XGBoost

# Settings for the script-mode estimator: train.py from ./training is the
# entry point, run on a single ml.m5.large instance.
script_mode_settings = dict(
    entry_point='train.py',
    source_dir='./training',
    framework_version="1.7-1",
    hyperparameters=hyperparams,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path='s3://{}/{}/output/'.format(bucket, prefix),
    base_job_name="script-training-job",
    environment=env_variables,
)
xgb_script_mode = XGBoost(**script_mode_settings)

INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.m5.large.


In [27]:
# Run the script-mode job and block until it finishes; log streaming is turned off.
xgb_script_mode.fit(
    training_inputs,
    wait=True,
    logs=False,
)

INFO:sagemaker:Creating training-job with name: script-training-job-2024-10-21-13-48-32-829



2024-10-21 13:48:34 Starting - Starting the training job..
2024-10-21 13:48:49 Starting - Preparing the instances for training....
2024-10-21 13:49:17 Downloading - Downloading input data.......
2024-10-21 13:49:57 Downloading - Downloading the training image............
2024-10-21 13:51:03 Training - Training image download completed. Training in progress......
2024-10-21 13:51:33 Uploading - Uploading generated training model..
2024-10-21 13:51:47 Completed - Training job completed


In [28]:
# Look the experiment up once so the run query and the UI link use the same id.
experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

# Most recently started run of the experiment.
last_run_id = mlflow.search_runs(
    experiment_ids=[experiment_id],
    max_results=1,
    order_by=["attributes.start_time DESC"]
)['run_id'][0]

presigned_url = sess.sagemaker_client.create_presigned_mlflow_tracking_server_url(
    TrackingServerName=mlflow_name,
    ExpiresInSeconds=60,
    SessionExpirationDurationInSeconds=1800
)['AuthorizedUrl']

# Build the link from the experiment's real id; a fixed id only works when the
# experiment happens to have been created with that id.
mlflow_run_link = f"{presigned_url.split('/auth')[0]}/#/experiments/{experiment_id}/runs/{last_run_id}"

In [29]:
from IPython.display import Javascript

# Ask the browser to open the MLflow run page in a new window.
open_run_js = 'window.open("{}");'.format(mlflow_run_link)
display(Javascript(open_run_js))

<IPython.core.display.Javascript object>