# [Lab2] SageMaker Training

In [None]:
# Restore every variable previously persisted with %store. Later cells use names
# such as `bucket`, `prefix` and `role` that this notebook never defines itself.
%store -r

In [None]:
import sagemaker
import boto3
from time import gmtime, strftime

# boto3 session created with default settings.
boto_session = boto3.Session()
# SageMaker SDK session; handed to both estimators and used later to request
# the presigned MLflow tracking-server URL.
sess = sagemaker.Session()

## 1. Preparing for Training Job

### Dependencies

In [None]:
# Create the folder that will hold requirements.txt (-p: also creates parents
# and does not fail when the folder already exists).
%mkdir -p './training/requirements/'

In [None]:
%%writefile './training/requirements/requirements.txt'
mlflow==2.13.2
sagemaker-mlflow==0.1.0

### Training Script

#### MLFlow AutoLog - https://mlflow.org/docs/latest/tracking/autolog.html

In [None]:
%%writefile ./training/train.py

import argparse
import json
import os
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import mlflow
import mlflow.xgboost

def parse_args():
    """Parse hyperparameters and SageMaker locations from the command line.

    Options that are not declared here are ignored instead of aborting the
    script, so extra hyperparameters configured on the estimator (for example
    ``eval_metric``) do not make the training script exit before it starts.

    Returns:
        argparse.Namespace: the parsed values. ``model_dir``, ``train`` and
        ``validation`` default to the ``SM_MODEL_DIR``, ``SM_CHANNEL_TRAIN``
        and ``SM_CHANNEL_VALIDATION`` environment variables, and are ``None``
        when the corresponding variable is unset.
    """
    parser = argparse.ArgumentParser()

    # Hyperparameters
    parser.add_argument('--max_depth', type=int, default=5)
    parser.add_argument('--eta', type=float, default=0.2)
    parser.add_argument('--gamma', type=float, default=4)
    parser.add_argument('--min_child_weight', type=float, default=6)
    parser.add_argument('--subsample', type=float, default=0.8)
    parser.add_argument('--objective', type=str, default='binary:logistic')
    parser.add_argument('--num_round', type=int, default=50)
    parser.add_argument('--verbosity', type=int, default=1)

    # SageMaker parameters
    parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--validation', type=str, default=os.environ.get('SM_CHANNEL_VALIDATION'))

    # parse_known_args() returns (namespace, leftovers); the leftovers are
    # options this script does not declare and are deliberately dropped.
    known_args, _unknown = parser.parse_known_args()
    return known_args

if __name__ == '__main__':
    # Script entry point: train an XGBoost booster on the train/validation CSV
    # channels, then record AUC metrics and the saved model file in MLflow.
    args = parse_args()

    # The tracking server is taken from the MLFLOW_TRACKING_ARN environment variable.
    mlflow.set_tracking_uri(os.getenv('MLFLOW_TRACKING_ARN'))

    # Enable XGBoost autologging
    mlflow.xgboost.autolog()

    # Load data. The "?format=csv&label_column=0" suffix marks the files as CSV
    # with the label stored in the first column.
    dtrain = xgb.DMatrix(os.path.join(args.train, 'train.csv?format=csv&label_column=0'))
    dval = xgb.DMatrix(os.path.join(args.validation, 'validation.csv?format=csv&label_column=0'))

    params = {
        'max_depth': args.max_depth,
        'eta': args.eta,
        'gamma': args.gamma,
        'min_child_weight': args.min_child_weight,
        'subsample': args.subsample,
        'objective': args.objective,
        'verbosity': args.verbosity,
    }

    # Attach to the run named by MLFLOW_RUN_ID when that variable is provided.
    # .get() yields None otherwise, so MLflow starts a new run instead of the
    # script failing with KeyError before any training has happened.
    with mlflow.start_run(run_id=os.environ.get('MLFLOW_RUN_ID')):
        model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=args.num_round,
            evals=[(dval, 'validation')],
        )

        # Evaluate model on both datasets using the labels stored in the DMatrix objects.
        train_preds = model.predict(dtrain)
        val_preds = model.predict(dval)

        train_auc = roc_auc_score(dtrain.get_label(), train_preds)
        val_auc = roc_auc_score(dval.get_label(), val_preds)

        mlflow.log_metrics({'train_auc': train_auc, 'val_auc': val_auc})

        # Save the model under the model directory and attach the same file to the run.
        model_location = os.path.join(args.model_dir, 'xgboost-model')
        model.save_model(model_location)
        mlflow.log_artifact(model_location)

        print(f"Training completed. Model saved at {model_location}")
        print(f"Train AUC: {train_auc}, Validation AUC: {val_auc}")

## 2. Input / Output Settings

### Training Input

#### Example - Built-in algorithm with Script Mode

In [None]:
# Look up the XGBoost 1.7-1 container image for the current region, reusing the
# boto3 session created at the top of the notebook instead of opening another one.
container = sagemaker.image_uris.retrieve(region=boto_session.region_name, framework='xgboost', version='1.7-1')

In [None]:
# One CSV TrainingInput per channel; each path template is filled in with the
# bucket and prefix.
channel_templates = {'train': train_path, 'validation': validation_path}

training_inputs = {
    channel: sagemaker.inputs.TrainingInput(
        s3_data=template.format(bucket, prefix),
        content_type='csv',
    )
    for channel, template in channel_templates.items()
}

# Keep the individual inputs available under their own names as well.
s3_input_train = training_inputs['train']
s3_input_validation = training_inputs['validation']

### Hyperparameters

In [None]:
# Constructor arguments that are identical for both estimators.
common_estimator_args = dict(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path='s3://{}/{}/output/'.format(bucket, prefix),
    base_job_name='training-job',
    sagemaker_session=sess,
)

# First hyperparameter set: larger learning rate, stronger regularisation, 50 rounds.
hyperparameters_1 = {
    'max_depth': 3,
    'eta': 0.5,
    'gamma': 4,
    'eval_metric': "auc",
    'min_child_weight': 6,
    'subsample': 0.8,
    'verbosity': 0,
    'objective': 'binary:logistic',
    'num_round': 50,
}

# Second hyperparameter set: smaller learning rate, lighter regularisation, 100 rounds.
hyperparameters_2 = {
    'max_depth': 3,
    'eta': 0.1,
    'gamma': 2,
    'eval_metric': "auc",
    'min_child_weight': 3,
    'subsample': 0.4,
    'verbosity': 0,
    'objective': 'binary:logistic',
    'num_round': 100,
}

xgb1 = sagemaker.estimator.Estimator(**common_estimator_args)
xgb1.set_hyperparameters(**hyperparameters_1)

xgb2 = sagemaker.estimator.Estimator(**common_estimator_args)
xgb2.set_hyperparameters(**hyperparameters_2)

### MLFlow Setup

In [None]:
import mlflow

# Point the MLflow client at the tracking server and select the experiment
# under which the runs logged by this notebook are recorded.
mlflow.set_tracking_uri(mlflow_arn)
experiment = mlflow.set_experiment(experiment_name=experiment_name)

## 3. Training Job

### Training Job

In [None]:
# Launch the first training job without blocking (wait=False) so the second
# job can be started right away; log streaming is turned off.
xgb1.fit(training_inputs,wait=False,logs=False) 

In [None]:
# Launch the second training job and block until it finishes (wait=True);
# log streaming is turned off.
xgb2.fit(training_inputs,wait=True,logs=False) 

## Training in progress. Please stand by.

### Logging the training job

In [None]:
def load_model(model_data_s3_uri):
    """Download a model archive from S3 and load it as an XGBoost Booster.

    The archive is saved as ``./xgboost-model.tar.gz`` and unpacked into the
    current directory; the booster is then read from the extracted
    ``xgboost-model`` file.

    Args:
        model_data_s3_uri: ``s3://<bucket>/<key>`` location of the gzip-compressed
            tar archive that contains the ``xgboost-model`` file.

    Returns:
        xgboost.Booster: the loaded model.

    Raises:
        ValueError: if ``model_data_s3_uri`` is not of the form
            ``s3://<bucket>/<key>``.
    """
    # Validate the URI up front so a malformed value fails with a clear message
    # before anything is downloaded.
    scheme = "s3://"
    if not model_data_s3_uri.startswith(scheme):
        raise ValueError(f"Expected an s3:// URI, got: {model_data_s3_uri!r}")
    bucket_name, _, object_key = model_data_s3_uri[len(scheme):].partition("/")
    if not bucket_name or not object_key:
        raise ValueError(f"S3 URI must contain a bucket and a key: {model_data_s3_uri!r}")

    import tarfile

    import xgboost as xgb

    model_file = "./xgboost-model.tar.gz"
    boto3.client("s3").download_file(bucket_name, object_key, model_file)

    # NOTE(review): extractall() trusts the member paths inside the archive.
    # Presumably acceptable for artifacts produced by our own training jobs;
    # do not point this at archives from untrusted sources.
    with tarfile.open(model_file, "r:gz") as archive:
        archive.extractall(path=".")

    # Load model
    model = xgb.Booster()
    model.load_model("xgboost-model")

    return model
    
def run_logging(xgb_estimator, run_name):
    """Record a SageMaker training job as an MLflow run.

    Logs the estimator's hyperparameters, source tags that link back to the
    training job in the SageMaker console, the job name, the model artifact
    location, the ``train:auc`` / ``validation:auc`` metrics reported by the
    job, and the trained model itself.

    Args:
        xgb_estimator: estimator whose ``latest_training_job`` has been started
            with ``fit``.
        run_name: name given to the MLflow run.

    Returns:
        None.
    """
    # A job may have been launched with fit(wait=False); block until it is done
    # so that model_data and the metrics below actually exist. For a job that
    # has already completed this should return after a status check.
    xgb_estimator.latest_training_job.wait(logs=False)

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(xgb_estimator.hyperparameters())
        mlflow.set_tags(
            {
                'mlflow.user': user_profile_name,
                'mlflow.source.name': f'https://{region}.console.aws.amazon.com/sagemaker/home?region={region}#/jobs/{xgb_estimator.latest_training_job.name}',
                'mlflow.source.type': 'JOB'
            }
        )
        mlflow.log_param("training job name", xgb_estimator.latest_training_job.name)
        mlflow.log_param("model_origin", xgb_estimator.model_data)

        # dataframe() takes a force_refresh flag, not a metric name; ask for
        # fresh data explicitly and select the AUC metrics in the loop below.
        metrics = xgb_estimator.training_job_analytics.dataframe(force_refresh=True)
        logged_metrics = {}
        for _, row in metrics.iterrows():
            if row['metric_name'] in ['train:auc', 'validation:auc']:
                # The ':' is replaced with '_' in the logged metric name; when a
                # metric has several rows the last one wins.
                logged_metrics[row['metric_name'].replace(':', '_')] = row['value']
        mlflow.log_metrics(logged_metrics)

        model = load_model(xgb_estimator.model_data)
        mlflow.xgboost.log_model(model, artifact_path="model")

In [None]:
# Labels for the two hyperparameter sets.
test_name_1, test_name_2 = "hyperparam1", "hyperparam2"

# Run names: label, the word "training", and a UTC day-hour-minute-second stamp.
run_name_1 = f"{test_name_1}-training-{strftime('%d-%H-%M-%S', gmtime())}"
run_name_2 = f"{test_name_2}-training-{strftime('%d-%H-%M-%S', gmtime())}"

In [None]:
# Record each training job as its own MLflow run, first xgb1 and then xgb2.
run_logging(xgb1, run_name_1)
run_logging(xgb2, run_name_2)

In [None]:
# Persist both run names with %store so they can be restored later via `%store -r`.
%store run_name_1
%store run_name_2

## Checking the training results

In [None]:
# Resolve the experiment once so the run lookup and the UI link use the same id.
experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

# Most recently started run in the experiment.
last_run_id = mlflow.search_runs(
    experiment_ids=[experiment_id],
    max_results=1,
    order_by=["attributes.start_time DESC"]
)['run_id'][0]

# Sign-in URL for the tracking server UI: the URL itself expires after 60 s,
# the session it opens lasts 1800 s.
presigned_url = sess.sagemaker_client.create_presigned_mlflow_tracking_server_url(
    TrackingServerName=mlflow_name,
    ExpiresInSeconds=60,
    SessionExpirationDurationInSeconds=1800
)['AuthorizedUrl']

# Link to the run page. The experiment id comes from the tracking server rather
# than being assumed to be 1, which only holds for the first experiment created.
mlflow_run_link = f"{presigned_url.split('/auth')[0]}/#/experiments/{experiment_id}/runs/{last_run_id}"

In [None]:
from IPython.display import Javascript

# Emit a JavaScript window.open() call so the browser opens the MLflow run page.
display(Javascript('window.open("{}");'.format(mlflow_run_link)))

## Register the models in MLFlow

In [None]:
# Registry name shared by the model versions registered in the following cells.
registered_model_name = f"model-{experiment_name}"

In [None]:
# Find the run logged for the first training job by its run name. Picking runs
# by position in a start-time ordering selects the wrong run as soon as the
# experiment holds any other run (for example after re-running the logging cell).
run_id_1 = mlflow.search_runs(
    experiment_ids=[mlflow.get_experiment_by_name(experiment_name).experiment_id],
    filter_string=f"attributes.run_name = '{run_name_1}'",
    max_results=1,
    order_by=["attributes.start_time DESC"]
)['run_id'][0]

# "model" is the artifact path the model was logged under for that run.
model_uri = f"runs:/{run_id_1}/model"

registered_model_version_1 = mlflow.register_model(model_uri, registered_model_name)

In [None]:
# Find the run logged for the second training job by its run name. Picking runs
# by position in a start-time ordering selects the wrong run as soon as the
# experiment holds any other run (for example after re-running the logging cell).
run_id_2 = mlflow.search_runs(
    experiment_ids=[mlflow.get_experiment_by_name(experiment_name).experiment_id],
    filter_string=f"attributes.run_name = '{run_name_2}'",
    max_results=1,
    order_by=["attributes.start_time DESC"]
)['run_id'][0]

# "model" is the artifact path the model was logged under for that run.
model_uri = f"runs:/{run_id_2}/model"

registered_model_version_2 = mlflow.register_model(model_uri, registered_model_name)

In [None]:
# Persist the run ids and registered model versions with %store so they can be
# restored later via `%store -r`.
%store run_id_1
%store run_id_2
%store registered_model_version_1
%store registered_model_version_2

In [None]:
# End the MLflow run that is currently active in this kernel, if any.
mlflow.end_run()