In [1]:
import boto3
import sagemaker
import os
import pandas as pd

from sagemaker import get_execution_role

%load_ext dotenv
%dotenv

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\kamil\AppData\Local\sagemaker\sagemaker\config.yaml


In [2]:
region = boto3.Session().region_name
print("AWS Region: {}".format(region))

role = os.environ["ROLE"]
print("RoleArn: {}".format(role))

AWS Region: eu-north-1
RoleArn: arn:aws:iam::284415450706:role/service-role/AmazonSageMaker-ExecutionRole-20240309T101533


In [3]:
bucket = os.environ["BUCKET"]
prefix = "DEMO-smdebug-xgboost-home-win-prediction"

In [4]:
import shap

X, y = shap.datasets.adult()

# X_display, y_display = shap.datasets.adult(display=True)
# feature_names = list(X.columns)

In [5]:
# create a train/test split
from sklearn.model_selection import train_test_split  # For splitting the dataset

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
# X_train_display = X_display.loc[X_train.index]

In [6]:
# train = pd.concat(
#     [pd.Series(y_train, index=X_train.index, name="Income>50K", dtype=int), X_train],
#     axis=1,
# )
# test = pd.concat(
#     [pd.Series(y_test, index=X_test.index, name="Income>50K", dtype=int), X_test],
#     axis=1,
# )

# Use 'csv' format to store the data
# The first column is expected to be the output column
# train.to_csv("train.csv", index=False, header=False)
# test.to_csv("validation.csv", index=False, header=False)
# 
# boto3.Session().resource("s3").Bucket(bucket).Object(
#     os.path.join(prefix, "data/train.csv")
# ).upload_file("train.csv")
# boto3.Session().resource("s3").Bucket(bucket).Object(
#     os.path.join(prefix, "data/validation.csv")
# ).upload_file("validation.csv")

In [7]:
from pathlib import Path
from pythonProject.program.code.containers.SageMakerContainerBuilder import SageMakerContainerBuilder

LOCAL_MODE = True
CODE_FOLDER = Path("code")
image_name = 'xgb-clf-custom-training-container'

builder = SageMakerContainerBuilder(
    code_folder=CODE_FOLDER,
    image_name=image_name,
    local_mode=LOCAL_MODE
)

# training_container_image = builder.build_and_push()
# print(training_container_image)
training_container_image = builder.get_image_uri()
print(training_container_image)

284415450706.dkr.ecr.eu-north-1.amazonaws.com/xgb-clf-custom-training-container:latest


In [8]:
from sagemaker import ProfilerConfig, Profiler
from sagemaker.debugger import DebuggerHookConfig, CollectionConfig, Rule, rule_configs
from sagemaker.estimator import Estimator

use_spot_instances = True and not LOCAL_MODE
max_run = 500
max_wait = 800 if use_spot_instances else None
instance_type = 'ml.m5.4xlarge'
save_interval = 2

base_job_name = "demo-smdebug-xgboost-home-win-prediction-classification"
bucket_path = "s3://{}".format(bucket)

debugger_hook_config = DebuggerHookConfig(
    s3_output_path=bucket_path,
    collection_configs=[
        CollectionConfig(
            name="average_shap",
            parameters={
                "save_interval": str(save_interval)
            }
        ),
        CollectionConfig(
            name="full_shap",
            parameters={
                "save_interval": str(save_interval)
            }
        ),
    ]
)

rules = [Rule.sagemaker(rule_configs.create_xgboost_report())]

xgb_estimator = Estimator(
    image_uri=training_container_image,
    instance_count=2,
    py_version="py310",
    base_job_name=base_job_name,
    instance_type=instance_type,
    # sagemaker_session=,
    role=role,
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    disable_profiler=False,
    # debugger_hook_config=DebuggerHookConfig(
    #     s3_output_path=bucket_path,  # Required
    #     collection_configs=[
    #         # CollectionConfig(name="metrics", parameters={"save_interval": str(save_interval)}),
    #         # CollectionConfig(
    #         #     name="feature_importance",
    #         #     parameters={"save_interval": str(save_interval)},
    #         # ),
    #         CollectionConfig(name="full_shap", parameters={"save_interval": str(save_interval)}),
    #         CollectionConfig(name="average_shap", parameters={"save_interval": str(save_interval)}),
    #     ],
    # ),
    rules=[
        Rule.sagemaker(
            rule_configs.create_xgboost_report(),
        )
        # Rule.sagemaker(
        #     rule_configs.class_imbalance(),
        #     rule_parameters={
        #         "threshold_imbalance": "10",
        #         "threshold_misprediction": "0.7",
        #     },
        # ),
    ],

)

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\kamil\AppData\Local\sagemaker\sagemaker\config.yaml


In [9]:
from sagemaker.session import TrainingInput

train_input = TrainingInput(
    "s3://football-data-kamil/football-pipeline/6k6z5zfhjtqf/split-and-transform-data/output/train/train.csv"
)
validation_input = TrainingInput(
    "s3://football-data-kamil/football-pipeline/6k6z5zfhjtqf/split-and-transform-data/output/validation/validation.csv"
)

# s3://football-data-kamil/DEMO-smdebug-xgboost-home-win-prediction\data/train.csv
# train_input = TrainingInput(f"s3://football-data-kamil/DEMO-smdebug-xgboost-home-win-prediction\data/train.csv", content_type="text/csv")
# validation_input = TrainingInput(f"s3://football-data-kamil/DEMO-smdebug-xgboost-home-win-prediction\data/validation.csv", content_type="text/csv")

pipeline_input = TrainingInput(
    "s3://football-data-kamil/football-pipeline/6k6z5zfhjtqf/split-and-transform-data/output/model/model.tar.gz"
)



xgb_estimator.fit(
    {"train": train_input, "validation": validation_input, "pipeline": pipeline_input},
    # This is a fire and forget event. By setting wait=False, you submit the job to run in the background.
    # Amazon SageMaker starts one training job and release control to next cells in the notebook.
    # Follow this notebook to see status of the training job.
    wait=False,
)

INFO:sagemaker:Creating training-job with name: demo-smdebug-xgboost-home-win-predictio-2024-09-01-16-55-55-916


In [11]:
import time

for _ in range(36):
    job_name = xgb_estimator.latest_training_job.name
    client = xgb_estimator.sagemaker_session.sagemaker_client
    description = client.describe_training_job(TrainingJobName=job_name)
    training_job_status = description["TrainingJobStatus"]
    rule_job_summary = xgb_estimator.latest_training_job.rule_job_summary()
    rule_evaluation_status = rule_job_summary[0]["RuleEvaluationStatus"]
    print(
        "Training job status: {}, Rule Evaluation Status: {}".format(
            training_job_status, rule_evaluation_status
        )
    )

    if training_job_status in ["Completed", "Failed"]:
        break

    time.sleep(10)

Training job status: Completed, Rule Evaluation Status: NoIssuesFound


In [12]:
from smdebug.trials import create_trial

s3_output_path = xgb_estimator.latest_job_debugger_artifacts_path()
trial = create_trial(s3_output_path)

[2024-09-01 19:03:24.212 DESKTOP-Q66KT1Q:30300 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2024-09-01 19:03:24.227 DESKTOP-Q66KT1Q:30300 INFO s3_trial.py:42] Loading trial debug-output at path s3://sagemaker-eu-north-1-284415450706/demo-smdebug-xgboost-home-win-predictio-2024-09-01-16-55-55-916\debug-output


KeyboardInterrupt: 

In [17]:
rule_output_path = xgb_estimator.output_path + xgb_estimator.latest_training_job.job_name + "/rule-output"
rule_output_path

's3://sagemaker-eu-north-1-284415450706/demo-smdebug-xgboost-home-win-predictio-2024-09-01-16-55-55-916/rule-output'

In [18]:
! aws s3 ls {rule_output_path} --recursive

2024-09-01 18:58:46     373700 demo-smdebug-xgboost-home-win-predictio-2024-09-01-16-55-55-916/rule-output/CreateXgboostReport/xgboost_report.html
2024-09-01 18:58:44     173452 demo-smdebug-xgboost-home-win-predictio-2024-09-01-16-55-55-916/rule-output/CreateXgboostReport/xgboost_report.ipynb


In [19]:
! aws s3 cp {rule_output_path} ./report/ --recursive

Completed 169.4 KiB/534.3 KiB (451.5 KiB/s) with 2 file(s) remaining
download: s3://sagemaker-eu-north-1-284415450706/demo-smdebug-xgboost-home-win-predictio-2024-09-01-16-55-55-916/rule-output/CreateXgboostReport/xgboost_report.ipynb to report\CreateXgboostReport\xgboost_report.ipynb
Completed 169.4 KiB/534.3 KiB (451.5 KiB/s) with 1 file(s) remaining
Completed 425.4 KiB/534.3 KiB (1.1 MiB/s) with 1 file(s) remaining  
Completed 534.3 KiB/534.3 KiB (1.2 MiB/s) with 1 file(s) remaining  
download: s3://sagemaker-eu-north-1-284415450706/demo-smdebug-xgboost-home-win-predictio-2024-09-01-16-55-55-916/rule-output/CreateXgboostReport/xgboost_report.html to report\CreateXgboostReport\xgboost_report.html


In [13]:
sagemaker_client = boto3.client('sagemaker')

def get_latest_processing_job():
    processing_jobs = sagemaker_client.list_processing_jobs(SortBy='CreationTime', SortOrder='Descending')['ProcessingJobSummaries']
    if processing_jobs:
        # The latest processing job
        latest_job_name = processing_jobs[0]['ProcessingJobName']
        return latest_job_name
    else:
        raise Exception("No processing jobs found")

In [14]:
import boto3
trn_job_name = get_latest_processing_job()

response = sagemaker_client.describe_processing_job(
    ProcessingJobName=trn_job_name
)

print(response['ProcessingJobStatus'])
print(response.get('FailureReason', 'No failure reason provided'))

Completed
No failure reason provided
