In [3]:
import boto3
import sagemaker
from datetime import datetime
from smdebug.trials import create_trial

sm_client = boto3.client('sagemaker')

# Function to get the latest training job
def get_latest_training_job():
    # List training jobs and sort by creation time
    training_jobs = sm_client.list_training_jobs(SortBy='CreationTime', SortOrder='Descending')['TrainingJobSummaries']
    if training_jobs:
        # The latest training job
        latest_job_name = training_jobs[0]['TrainingJobName']
        return latest_job_name
    else:
        raise Exception("No training jobs found")

# Function to get the debugger artifacts path for the latest training job
def get_debugger_artifacts_path(training_job_name):
    response = sm_client.describe_training_job(TrainingJobName=training_job_name)
    if 'DebugHookConfig' in response and 'S3OutputPath' in response['DebugHookConfig']:
        s3_output_path = response['DebugHookConfig']['S3OutputPath']
        if not s3_output_path.endswith('/'):
            s3_output_path += '/'
        return s3_output_path + training_job_name + "/"
    else:
        raise Exception("Debugger artifacts path not found for the training job")
# 'xgb-clf-custom-training-container-jmtl1mbhwl5b-rt2jD4gy9m'

# Get the latest training job name
latest_training_job_name = get_latest_training_job()
print(f"Latest Training Job Name: {latest_training_job_name}")

# Get the debugger artifacts path
debugger_artifacts_path = get_debugger_artifacts_path(latest_training_job_name)
print(f"Debugger Artifacts Path: {debugger_artifacts_path}")

# Ensure the path uses forward slashes
debugger_artifacts_path = debugger_artifacts_path.replace('\\', '/')
print(f"Corrected Debugger Artifacts Path: {debugger_artifacts_path}")

# Create a trial using the debugger artifacts path
trial = create_trial(debugger_artifacts_path)
print("Trial created successfully")

Latest Training Job Name: demo-smdebug-xgboost-home-win-predictio-2024-09-01-16-43-22-063
Debugger Artifacts Path: s3://sagemaker-eu-north-1-284415450706/demo-smdebug-xgboost-home-win-predictio-2024-09-01-16-43-22-063/
Corrected Debugger Artifacts Path: s3://sagemaker-eu-north-1-284415450706/demo-smdebug-xgboost-home-win-predictio-2024-09-01-16-43-22-063/
[2024-09-01 18:43:44.782 DESKTOP-Q66KT1Q:26176 INFO s3_trial.py:42] Loading trial  at path s3://sagemaker-eu-north-1-284415450706/demo-smdebug-xgboost-home-win-predictio-2024-09-01-16-43-22-063/


KeyboardInterrupt: 

In [9]:
import sagemaker
import boto3
import os

sagemaker_client = boto3.client('sagemaker')

training_jobs = sagemaker_client.list_training_jobs(SortBy='CreationTime', SortOrder='Descending', MaxResults=1)

if not training_jobs['TrainingJobSummaries']:
    raise ValueError("No training jobs found.")

last_training_job_name = training_jobs['TrainingJobSummaries'][0]['TrainingJobName']

estimator = sagemaker.estimator.Estimator.attach(last_training_job_name)

xgb_profile_job_name = [rule["RuleEvaluationJobArn"].split("/")[-1] 
                        for rule in estimator.latest_training_job.rule_job_summary() 
                        if "CreateXgboostReport" in rule["RuleConfigurationName"]][0]

# Construct the paths for the outputs
base_output_path = os.path.dirname(estimator.latest_job_debugger_artifacts_path())
rule_output_path = os.path.join(base_output_path, "rule-output/")
xgb_report_path = os.path.join(rule_output_path, "CreateXgboostReport")
# profile_report_path = os.path.join(rule_output_path, profiler_report_name)

print(f"Last training job: {last_training_job_name}")
print(f"XGBoost report path: {xgb_report_path}")
# print(f"Profiler report path: {profile_report_path}")

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\kamil\AppData\Local\sagemaker\sagemaker\config.yaml

2024-09-01 16:49:23 Starting - Preparing the instances for training
2024-09-01 16:49:23 Downloading - Downloading the training image
2024-09-01 16:49:23 Training - Training image download completed. Training in progress.
2024-09-01 16:49:23 Uploading - Uploading generated training model
2024-09-01 16:49:23 Completed - Training job completed


KeyboardInterrupt: 

In [4]:
import boto3

# Initialize the Boto3 client for CloudWatch Logs
logs_client = boto3.client('logs')

# The log group name usually follows this format
log_group_name = '/aws/sagemaker/TrainingJobs'

# Replace this with your training job name
# job_name = 'xgb-clf-custom-training-container-in2tbqwqtlty-q3cahvxZxG'
job_name = 'xgb-clf-custom-training-container-ia2ww3101bp9-a3AVuDPz3a'

# Retrieve the log streams associated with the training job
response = logs_client.describe_log_streams(
    logGroupName=log_group_name,
    logStreamNamePrefix=job_name
)

log_streams = response['logStreams']

# Loop through the log streams to get the logs
for log_stream in log_streams:
    log_stream_name = log_stream['logStreamName']
    print(f"Logs for stream: {log_stream_name}")

    log_events = logs_client.get_log_events(
        logGroupName=log_group_name,
        logStreamName=log_stream_name,
        startFromHead=True
    )

    # Print the logs
    for event in log_events['events']:
        print(event['message'])

Logs for stream: xgb-clf-custom-training-container-ia2ww3101bp9-a3AVuDPz3a/algo-1-1724687684
2024-08-26 15:56:23,553 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-08-26 15:56:23,554 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-08-26 15:56:23,566 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-08-26 15:56:23,566 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-08-26 15:56:23,577 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-08-26 15:56:23,577 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-08-26 15:56:23,589 sagemaker-training-toolkit INFO     Invoking user script
Training Env:
{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "pipeline": "/opt/ml/input/data/pipeline",
        "train": "/opt/ml/in

In [7]:
import sagemaker
import boto3
import os

sagemaker_client = boto3.client('sagemaker')

training_jobs = sagemaker_client.list_training_jobs(SortBy='CreationTime', SortOrder='Descending', MaxResults=2)

if not training_jobs['TrainingJobSummaries']:
    raise ValueError("No training jobs found.")

last_training_job_name = training_jobs['TrainingJobSummaries'][1]['TrainingJobName']

print(f"Last training job: {last_training_job_name}")

estimator = sagemaker.estimator.Estimator.attach(last_training_job_name)

Last training job: xgb-clf-training-container-dfeh31dwow0g-JpvKob5MVO

2024-10-28 17:03:46 Starting - Preparing the instances for training
2024-10-28 17:03:46 Downloading - Downloading the training image
2024-10-28 17:03:46 Training - Training image download completed. Training in progress.
2024-10-28 17:03:46 Uploading - Uploading generated training model
2024-10-28 17:03:46 Completed - Training job completed


In [12]:
import time
for _ in range(36):
    job_name = estimator.latest_training_job.name
    client = estimator.sagemaker_session.sagemaker_client
    description = client.describe_training_job(TrainingJobName=job_name)
    training_job_status = description["TrainingJobStatus"]
    rule_job_summary = estimator.latest_training_job.rule_job_summary()
    rule_evaluation_status = rule_job_summary[0]["RuleEvaluationStatus"]
    print(
        "Training job status: {}, rule evaluation status: {}".format(
            training_job_status, rule_evaluation_status
        )
    )

    if training_job_status in ["Completed", "Failed"]:
        break

    time.sleep(10)

Training job status: Completed, rule evaluation status: NoIssuesFound


In [15]:
from smdebug.trials import create_trial

s3_output_path = 's3://football-bucket/train_analyse/xgb-clf-training-container-dfeh31dwow0g-JpvKob5MVO/debug-output/'
trial = create_trial(s3_output_path)

[2024-10-28 19:00:21.250 DESKTOP-Q66KT1Q:7904 INFO s3_trial.py:42] Loading trial  at path s3://football-bucket/train_analyse/xgb-clf-training-container-dfeh31dwow0g-JpvKob5MVO/debug-output/
Invalid bucket name "football-bucket\train_analyse": Bucket name must match the regex "^[a-zA-Z0-9.\-_]{1,255}$" or be an ARN matching the regex "^arn:(aws).*:(s3|s3-object-lambda):[a-z\-0-9]*:[0-9]{12}:accesspoint[/:][a-zA-Z0-9\-.]{1,63}$|^arn:(aws).*:s3-outposts:[a-z\-0-9]+:[0-9]{12}:outpost[/:][a-zA-Z0-9\-]{1,63}[/:]accesspoint[/:][a-zA-Z0-9\-]{1,63}$"
Invalid bucket name "football-bucket\train_analyse": Bucket name must match the regex "^[a-zA-Z0-9.\-_]{1,255}$" or be an ARN matching the regex "^arn:(aws).*:(s3|s3-object-lambda):[a-z\-0-9]*:[0-9]{12}:accesspoint[/:][a-zA-Z0-9\-.]{1,63}$|^arn:(aws).*:s3-outposts:[a-z\-0-9]+:[0-9]{12}:outpost[/:][a-zA-Z0-9\-]{1,63}[/:]accesspoint[/:][a-zA-Z0-9\-]{1,63}$"
Invalid bucket name "football-bucket\train_analyse": Bucket name must match the regex "^[a-zA-

KeyboardInterrupt: 