In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
from pathlib import Path
import warnings

warnings.filterwarnings('ignore')
CODE_FOLDER = Path("code")
CODE_FOLDER.mkdir(parents=True, exist_ok=True)

sys.path.append(f"./{CODE_FOLDER}")

In [2]:
# !aws s3api create-bucket --bucket football-data-kamil --create-bucket-configuration LocationConstraint=eu-north-1

In [3]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

import sys
import logging
import ipytest
from pathlib import Path


CODE_FOLDER = Path("code")
CODE_FOLDER.mkdir(parents=True, exist_ok=True)
INFERENCE_CODE_FOLDER = CODE_FOLDER / "inference"
INFERENCE_CODE_FOLDER.mkdir(parents=True, exist_ok=True)

sys.path.extend([f"./{CODE_FOLDER}", f"./{INFERENCE_CODE_FOLDER}"])

ipytest.autoconfig(raise_on_error=True)

# By default, The SageMaker SDK logs events related to the default
# configuration using the INFO level. To prevent these from spoiling
# the output of this notebook cells, we can change the logging
# level to ERROR instead.
logging.getLogger("sagemaker.config").setLevel(logging.ERROR)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import os
import logging

from sagemaker.workflow.pipeline_context import PipelineSession, LocalPipelineSession

# Update this variable to your bucket name. This name must be unique
# across all AWS accounts.
BUCKET = os.environ["BUCKET"]
S3_LOCATION = f"s3://{BUCKET}/football"

# To run this notebook in Local Model, this constant must be set to True.
# I'm trying to do this automatically by checking for a specific environment
# variable that is set by SageMaker when you run the notebook inside SageMaker
# Studio. 
# LOCAL_MODE = "SAGEMAKER_INTERNAL_IMAGE_URI" not in os.environ

LOCAL_MODE = True

# This variable will be used to determine the architecture of the
# local machine. If the machine is an ARM64 machine, you will need
# to build a custom Docker image using the setup notebook.
ARCHITECTURE = !(uname -m)

# This is a dummy role that will be ignored when we run the
# pipeline in Local Mode.
DUMMY_ROLE = "arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-11111111111111"

# We'll use these two variables to configure the steps that do not support
# Local Mode.
pipeline_session = PipelineSession(default_bucket=BUCKET) if not LOCAL_MODE else LocalPipelineSession(default_bucket=BUCKET)
execution_role = os.environ["ROLE"] if not LOCAL_MODE else DUMMY_ROLE

if LOCAL_MODE:
    config = {
        "session": pipeline_session,
        "instance_type": "local",
        "role": DUMMY_ROLE,

        # We need to use a custom Docker image when we run the pipeline
        # in Local Model on an ARM64 machine.
        "image": "sagemaker-tensorflow-training-toolkit-local" if ARCHITECTURE[0] == "arm64" else None,
        "framework_version": None if ARCHITECTURE[0] == "arm64" else "1.7-1",
        "py_version": None if ARCHITECTURE[0] == "arm64" else "py39",
    }
else:
    config = {
        "session": pipeline_session,
        "instance_type": "ml.c5.xlarge",
        "role": execution_role,
        "image": None,        
        "framework_version": "1.7-1",
        "py_version": "py39",
    }

# By default, The SageMaker SDK logs events related to the default
# configuration using the INFO level. To prevent these from spoiling
# the output of this notebook cells, we can change the logging
# level to ERROR instead.
logging.getLogger("sagemaker.config").setLevel(logging.ERROR)

Windows Support for Local Mode is Experimental


In [5]:
# import boto3
# 
# role = os.environ["ROLE"]
# 
# sagemaker_session = sagemaker.session.Session()
# sagemaker_client = boto3.client("sagemaker")
# iam_client = boto3.client("iam")
# region = boto3.Session().region_name

In [6]:
from sagemaker.s3 import S3Uploader

df_local_path=str(os.environ['DATA_FILEPATH_X']) 
y_local_path=str(os.environ['DATA_FILEPATH_Y'])

# S3Uploader.upload(local_path=df_local_path, desired_s3_uri=f"{S3_LOCATION}/data", sagemaker_session=sagemaker_session)
# S3Uploader.upload(local_path=y_local_path, desired_s3_uri=f"{S3_LOCATION}/data", sagemaker_session=sagemaker_session)

In [7]:
from sagemaker.workflow.steps import CacheConfig

cache_config = CacheConfig(enable_caching=True, expire_after="15d")

In [8]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.parameters import ParameterString

dataset_location = ParameterString(
    name="dataset_location",
    default_value=f"{S3_LOCATION}/data",
)

processor = SKLearnProcessor(
    base_job_name="split-and-transform-data",
    framework_version="1.2-1",
    instance_type=config["instance_type"],
    instance_count=2,
    role=config["role"],
    sagemaker_session=config["session"],
)

split_and_transform_data_step = ProcessingStep(
    name="split-and-transform-data",
    step_args=processor.run(
        code=f"{CODE_FOLDER}/preprocessor.py",
        inputs=[
            ProcessingInput(source=dataset_location, destination="/opt/ml/processing/input"),  
        ],
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
            ProcessingOutput(output_name="model", source="/opt/ml/processing/model"),
            
            # The baseline output points to the test set before transforming the data. This set
            # will be helpful to generate a quality baseline for the model performance.
            ProcessingOutput(output_name="baseline", source="/opt/ml/processing/baseline"),
        ]
    ),
    cache_config=cache_config
)

In [9]:
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_definition_config import PipelineDefinitionConfig

pipeline_definition_config = PipelineDefinitionConfig(use_custom_job_prefix=True)

session1_pipeline = Pipeline(
    name="session1-pipeline",
    parameters=[dataset_location],
    steps=[
        split_and_transform_data_step,
    ],
    pipeline_definition_config=pipeline_definition_config,
    sagemaker_session=config['session'],
)

session1_pipeline.upsert(role_arn=config["role"])

{'PipelineArn': 'session1-pipeline'}

In [10]:
# session1_pipeline.start()

In [11]:
from sagemaker.xgboost import XGBoost

instance_type = "ml.c5.xlarge"

estimator = XGBoost(
    base_job_name="training",
    entry_point=f"{CODE_FOLDER}/train.py",
    role=config['role'],
    instance_count=1,
    instance_type=config['instance_type'],
    framework_version=config['framework_version'],
    image_uri=config["image"],
    disable_profiler=True,
)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: local.


In [12]:
from sagemaker.workflow.steps import TrainingStep
from sagemaker.inputs import TrainingInput


# train_model_step = TrainingStep(
#     name="train-model",
#     step_args=estimator.fit(
#         inputs={
#             "train": TrainingInput(
#                 s3_data=split_and_transform_data_step.properties.ProcessingOutputConfig.Outputs[
#                     "train"
#                 ].S3Output.S3Uri,
#                 content_type="text/csv"
#             ),
#             "validation": TrainingInput(
#                 s3_data=split_and_transform_data_step.properties.ProcessingOutputConfig.Outputs[
#                     "validation"
#                 ].S3Output.S3Uri,
#                 content_type="text/csv"
#             )
#         }
#     ),
#     cache_config=cache_config
# )

train_model_step = TrainingStep(
    name="train-model",
    estimator=estimator,
    inputs={
        "train": TrainingInput(
            s3_data=split_and_transform_data_step.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri,
            content_type="text/csv"
        ),
        "validation": TrainingInput(
            s3_data=split_and_transform_data_step.properties.ProcessingOutputConfig.Outputs["validation"].S3Output.S3Uri,
            content_type="text/csv"
        )
    },
    cache_config=cache_config
)

In [13]:
USE_TUNING_STEP = False and not LOCAL_MODE

In [14]:
from sagemaker.tuner import HyperparameterTuner
from sagemaker.parameter import IntegerParameter, ContinuousParameter

hyperparameter_ranges = {
    'max_depth': IntegerParameter(min_value=4, max_value=12, scaling_type="Auto"),
    'n_estimators': IntegerParameter(min_value=100, max_value=800, scaling_type="Auto"),
    'subsample': ContinuousParameter(min_value=0.5, max_value=1.0, scaling_type="Auto"),
    'colsample_bytree': ContinuousParameter(min_value=0.7, max_value=1.0, scaling_type="Logarithmic"),
    'colsample_bylevel': ContinuousParameter(min_value=0.8, max_value=1.0, scaling_type="Logarithmic"),
    'lambda': ContinuousParameter(min_value=0.1, max_value=5.0, scaling_type="Logarithmic"),
    'alpha': ContinuousParameter(min_value=0.1, max_value=20, scaling_type="Logarithmic"),
    'eta': ContinuousParameter(min_value=0.015, max_value=0.3, scaling_type="Logarithmic"),
    'min_child_weight': ContinuousParameter(min_value=10, max_value=35, scaling_type="Auto"),
    'colsample_bynode': ContinuousParameter(min_value=0.9, max_value=1, scaling_type="Logarithmic"),
    'num_round': IntegerParameter(min_value=10, max_value=30, scaling_type="Auto"),
}

objective_type = "Maximize"
metric_definitions = [{'Name': 'custom:f1', 'Regex': 'F1 score: ([0-9\\.]+)'}]
metric_name = "custom:f1"
strategy = "Bayesian"

tuner = HyperparameterTuner(
    estimator,
    objective_metric_name=metric_name,
    objective_type=objective_type,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    max_jobs=3,
    max_parallel_jobs=3,
)

In [15]:
from sagemaker.workflow.steps import TuningStep

tune_model_step = TuningStep(
    name="tune-model",
    tuner=tuner,
    inputs={
        "train": TrainingInput(
            s3_data=split_and_transform_data_step.properties.ProcessingOutputConfig.Outputs[
                "train"
            ].S3Output.S3Uri,
            content_type="text/csv"
        ),
        "validation": TrainingInput(
            s3_data=split_and_transform_data_step.properties.ProcessingOutputConfig.Outputs[
                "validation"
            ].S3Output.S3Uri,
            content_type="text/csv"
        )
    },
    cache_config=cache_config
)

In [16]:
session2_pipeline = Pipeline(
    name="session2-pipeline",
    parameters=[dataset_location],
    steps=[
        split_and_transform_data_step,
        tune_model_step if USE_TUNING_STEP else train_model_step,
    ],
    pipeline_definition_config=pipeline_definition_config,
    sagemaker_session=config["session"],
)

session2_pipeline.upsert(role_arn=config['role'])

{'PipelineArn': 'session2-pipeline'}

In [17]:
session2_pipeline.start()

INFO:sagemaker.local.entities:Starting execution for pipeline session2-pipeline. Execution ID is bcbb4212-df80-48ac-8329-695d857ef828
INFO:sagemaker.local.entities:Starting pipeline step: 'split-and-transform-data'
INFO:sagemaker.local.image:'Docker Compose' found using Docker CLI.
INFO:sagemaker.local.local_session:Starting processing job
INFO:sagemaker.local.image:Using the long-lived AWS credentials found in session
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-lo28j:
    container_name: p3ptl309nh-algo-1-lo28j
    entrypoint: &id001
    - python3
    - /opt/ml/processing/input/code/preprocessor.py
    environment:
    - '[Masked]'
    - '[Masked]'
    image: 662702820516.dkr.ecr.eu-north-1.amazonaws.com/sagemaker-scikit-learn:1.2-1-cpu-py3
    networks:
      sagemaker-local:
        aliases:
        - algo-1-lo28j
    stdin_open: true
    tty: true
    volumes:
    - C:\Users\kamil\AppData\Local\Temp\tmp4

 Container p3ptl309nh-algo-1-lo28j  Creating
 Container s7rs71go4w-algo-2-lo28j  Creating
 Container p3ptl309nh-algo-1-lo28j  Created
 Container s7rs71go4w-algo-2-lo28j  Created
Attaching to p3ptl309nh-algo-1-lo28j, s7rs71go4w-algo-2-lo28j
p3ptl309nh-algo-1-lo28j exited with code 0
Aborting on container exit...
 Container s7rs71go4w-algo-2-lo28j  Stopping
 Container p3ptl309nh-algo-1-lo28j  Stopping
 Container p3ptl309nh-algo-1-lo28j  Stopped
 Container s7rs71go4w-algo-2-lo28j  Stopped


INFO:sagemaker.local.image:===== Job Complete =====
INFO:sagemaker.local.entities:Pipeline step 'split-and-transform-data' SUCCEEDED.
INFO:sagemaker.local.entities:Starting pipeline step: 'train-model'
INFO:sagemaker.local.image:'Docker Compose' found using Docker CLI.
INFO:sagemaker.local.local_session:Starting training job
INFO:sagemaker.local.image:Using the long-lived AWS credentials found in session
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-rzde0:
    command: train
    container_name: gh6ef30r8p-algo-1-rzde0
    environment:
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    image: 662702820516.dkr.ecr.eu-north-1.amazonaws.com/sagemaker-xgboost:1.7-1
    networks:
      sagemaker-local:
        aliases:
        - algo-1-rzde0
    stdin_open: true
    tty: true
    volumes:
    - C:\Users\kamil\AppData\Local\Temp\tmpjyhhl_2x\algo-1-rzde0\input:/opt/ml/input
    - 

 Container gh6ef30r8p-algo-1-rzde0  Creating
 Container gh6ef30r8p-algo-1-rzde0  Created
Attaching to gh6ef30r8p-algo-1-rzde0
gh6ef30r8p-algo-1-rzde0  | [2024-03-21 17:02:31.545 807596178d6f:1 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
gh6ef30r8p-algo-1-rzde0  | [2024-03-21 17:02:31.558 807596178d6f:1 INFO profiler_config_parser.py:111] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.
gh6ef30r8p-algo-1-rzde0  | [2024-03-21:17:02:31:INFO] Imported framework sagemaker_xgboost_container.training
gh6ef30r8p-algo-1-rzde0  | [2024-03-21:17:02:31:INFO] No GPUs detected (normal if no gpus installed)
gh6ef30r8p-algo-1-rzde0  | [2024-03-21:17:02:31:INFO] Invoking user training script.
gh6ef30r8p-algo-1-rzde0  | [2024-03-21:17:02:32:INFO] Module train does not provide a setup.py. 
gh6ef30r8p-algo-1-rzde0  | Generating setup.py
gh6ef30r8p-algo-1-rzde0  | [2024-03-21:17:02:32:INFO] Generating setup.cfg
gh6ef30r8p-algo-1-rzde0  | [2024-03-21

INFO:root:creating C:\Users\kamil\AppData\Local\Temp\tmpjyhhl_2x\artifacts\output\data
INFO:root:creating C:\Users\kamil\AppData\Local\Temp\tmpjyhhl_2x\artifacts\model\001
INFO:root:copying C:\Users\kamil\AppData\Local\Temp\tmpjyhhl_2x\model\001\saved_model.bst -> C:\Users\kamil\AppData\Local\Temp\tmpjyhhl_2x\artifacts\model\001


gh6ef30r8p-algo-1-rzde0 exited with code 0
Aborting on container exit...
 Container gh6ef30r8p-algo-1-rzde0  Stopping
 Container gh6ef30r8p-algo-1-rzde0  Stopped


INFO:sagemaker.local.image:===== Job Complete =====
INFO:sagemaker.local.entities:Pipeline step 'train-model' SUCCEEDED.
INFO:sagemaker.local.entities:Pipeline execution bcbb4212-df80-48ac-8329-695d857ef828 SUCCEEDED


<sagemaker.local.entities._LocalPipelineExecution at 0x1b1fbe46cc0>

In [18]:
session2_pipeline.describe()

{'PipelineArn': 'session2-pipeline',
 'PipelineDefinition': '{"Version": "2020-12-01", "Metadata": {}, "Parameters": [{"Name": "dataset_location", "Type": "String", "DefaultValue": "s3://football-data-kamil/football/data"}], "PipelineExperimentConfig": {"ExperimentName": {"Get": "Execution.PipelineName"}, "TrialName": {"Get": "Execution.PipelineExecutionId"}}, "Steps": [{"Name": "split-and-transform-data", "Type": "Processing", "Arguments": {"ProcessingJobName": "split-and-transform-data", "ProcessingResources": {"ClusterConfig": {"InstanceType": "local", "InstanceCount": 2, "VolumeSizeInGB": 30}}, "AppSpecification": {"ImageUri": "662702820516.dkr.ecr.eu-north-1.amazonaws.com/sagemaker-scikit-learn:1.2-1-cpu-py3", "ContainerEntrypoint": ["python3", "/opt/ml/processing/input/code/preprocessor.py"]}, "RoleArn": "arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-11111111111111", "ProcessingInputs": [{"InputName": "input-1", "AppManaged": false, "S3Input": {"S3Uri"