In [1]:
# Record which sagemaker-related packages (and versions) are installed in the
# kernel's environment, for reproducibility of the cells below.
%pip freeze | grep sagemaker

sagemaker==2.235.2
sagemaker-core==1.0.76
sagemaker-experiments==0.1.45
sagemaker_training==4.9.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
from sagemaker.estimator import Estimator
from sagemaker.session import Session
from sagemaker import get_execution_role

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [3]:
# SageMaker session handed to the Estimator below.
sess = Session()
# IAM execution role the training job will run under.
role = get_execution_role()

In [17]:
# Configure a single-instance training job that runs a custom container image.
# NOTE(review): the recorded run of this cell failed inside the container with
# "ModuleNotFoundError: No module named 'retrying'" while importing
# sagemaker_training, so the image appears to be missing that package.
estimator = Estimator(
    image_uri='155954279556.dkr.ecr.us-east-1.amazonaws.com/gs-automl-base-containers/tabular312_sm:1.0',
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    # Settings handed to the training job as hyperparameters.
    hyperparameters={
        "table_name": "automl-classification-experiment",
        "project_hashkey": "2ee07a49",
        "experiment_hashkey": "1cbd8309",
        "dataset_table_name": "automl-dataset",
        "dataset_profile_table_name": "automl-dataset-profile-experiment-result",
        "model_repo_table_name": "automl-model-repo",
        "model_experiment_result_table_name": "automl-classification-experiment",
        # NOTE(review): hard-coded personal e-mail address; consider reading it
        # from configuration before sharing this notebook.
        "username": "sean@gs.co.kr",
        "job_type": "training",
        # NOTE(review): looks like a placeholder token; confirm it is not a
        # real secret and do not hard-code a real one here.
        "task_token": "1234",
    },
    base_job_name='custom-training',
    sagemaker_session=sess,
    # Tags (to satisfy SCP requirements).
    tags=[
        {'Key': 'Environment', 'Value': 'dev'},
        {'Key': 'Project', 'Value': 'automl'},
        {'Key': 'Owner', 'Value': 'sean'},
        {'Key': 'CostCenter', 'Value': 'gs-retail'}
    ],
    # Use an existing bucket (prevents a bucket from being created).
    # Plain literal: the original f-string had no placeholders.
    output_path='s3://retail-mlops-edu-202602/output',
)

# Launch the training job. A ValueError is printed rather than re-raised,
# presumably so the log-inspection cells below can still be run after a
# failed job; TODO confirm that swallowing the error here is intended.
try:
    estimator.fit()
except ValueError as e:
    print(e)

INFO:sagemaker:Creating training-job with name: custom-training-2026-02-10-14-02-48-837


2026-02-10 14:02:49 Starting - Starting the training job......
2026-02-10 14:03:33 Starting - Preparing the instances for training...
2026-02-10 14:04:23 Downloading - Downloading the training image...
2026-02-10 14:04:53 Training - Training image download completed. Training in progress...[34mTraceback (most recent call last):
  File "/usr/local/bin/train", line 3, in <module>
    from sagemaker_training.cli.train import main
  File "/usr/local/lib/python3.12/site-packages/sagemaker_training/cli/train.py", line 14, in <module>
    from sagemaker_training import trainer
  File "/usr/local/lib/python3.12/site-packages/sagemaker_training/trainer.py", line 23, in <module>
    from sagemaker_training import (
  File "/usr/local/lib/python3.12/site-packages/sagemaker_training/entry_point.py", line 24, in <module>
    from retrying import retry[0m
[34mModuleNotFoundError: No module named 'retrying'[0m

2026-02-10 14:05:17 Uploading - Uploading generated training model
2026-02-10 14:05:17

In [13]:
# Next, look up the most recent training job started through this estimator.
latest_job = estimator.latest_training_job
if latest_job is None:
    # The fit() cell prints ValueError instead of raising it, so this cell can
    # be reached without a job; fail with a clear message rather than an
    # AttributeError on None.
    raise RuntimeError(
        "No training job has been started for this estimator; run estimator.fit() first."
    )
training_job_name = latest_job.name
print("Training Job Name:", training_job_name)

# Build the CloudWatch log group name and the log stream name for the job.
# `log_stream` is used as a stream-name *prefix* further down; the stream
# listed in the recorded output carries an extra numeric suffix after "algo-1".
log_group = "/aws/sagemaker/TrainingJobs"  # plain literal: no placeholders needed
log_stream = f"{training_job_name}/algo-1"

print("Log Group:", log_group)
print("Log Stream:", log_stream)

Training Job Name: custom-training-2026-02-10-13-57-11-566
Log Group: /aws/sagemaker/TrainingJobs
Log Stream: custom-training-2026-02-10-13-57-11-566/algo-1


In [14]:
import boto3

# CloudWatch Logs client. The region is taken from the SageMaker session used
# to launch the training job instead of being hard-coded, so the log lookup
# targets the same region the job ran in.
logs_client = boto3.client("logs", region_name=sess.boto_region_name)

In [15]:
# List the CloudWatch log streams belonging to the training job, reusing the
# group name and stream prefix computed earlier instead of repeating the
# literals (the values are identical).
streams = logs_client.describe_log_streams(
    logGroupName=log_group,
    logStreamNamePrefix=log_stream,
)

for stream in streams['logStreams']:
    print(stream['logStreamName'])  # full stream name, including the generated suffix


custom-training-2026-02-10-13-57-11-566/algo-1-1770731887
