## Setup

In [1]:
import sagemaker

# Name of the S3 bucket used as the session default.
# NOTE(review): this looks like a placeholder -- S3 bucket names may not contain
# uppercase letters, so replace it with a real bucket before running.
bucket_name = "GreatSuperBucket"

# Bind the session to that bucket, then read the bucket back from the session
# so every later cell uses the value the session actually resolved.
sagemaker_session = sagemaker.Session(default_bucket=bucket_name)
bucket = sagemaker_session.default_bucket()

# Execution role passed to the training job definition.
role = sagemaker.get_execution_role()

# Root S3 URI of the bucket.
s3uri = "s3://{}".format(bucket)

## Prepare Data

In [None]:
# Upload the local ./dataset path to the bucket under the "dataset" key prefix
# and keep the value returned by the upload for the training step.
input_uri = sagemaker_session.upload_data(
    path="./dataset",
    bucket=bucket,
    key_prefix="dataset",
)
print(input_uri)

## Load Config

In [1]:
import json
from src.params import HyperParams

# Stays empty unless the configuration below loads and validates successfully.
hyper_params = ""

# Read the example configuration. The encoding is given explicitly so decoding
# does not depend on the platform locale, and the file is closed before the
# validation step starts.
with open("configs/example.json", encoding="utf-8") as f:
    d = json.load(f)

# Re-serialise the configuration to a single JSON string. The ``default`` hook is
# only consulted for values json cannot encode natively; it falls back to the
# object's ``__dict__``.
txt = json.dumps(d, default=lambda o: o.__dict__)

try:
    # Validation only: the parsed object is discarded, the JSON text is what
    # gets forwarded.
    HyperParams.from_json(txt)
except Exception as e:
    # Report the problem, then stop. Swallowing the error here would leave
    # ``hyper_params`` empty and let the following cells start a training job
    # with no configuration.
    print(f"throw error: {e}")
    raise
else:
    # The value handed on is the JSON text wrapped in single quotes.
    hyper_params = f"'{txt}'"
    # Echo the JSON wrapped in backslash-escaped single quotes
    # (presumably for pasting into a shell command -- confirm).
    print(f"'\\'{txt}\\''")

'\'{"epochs": 10, "batch_size": 8, "lr": 0.001, "dataset_params": {"skiprows": 1}}\''


## Train

In [4]:
from sagemaker.pytorch import PyTorch


# Training job definition for the PyTorch estimator.
estimator = PyTorch(
    # Training code: entry.py inside the local src directory.
    source_dir="src",
    entry_point="entry.py",
    # Framework and Python versions requested for the job.
    framework_version="1.11.0",
    py_version="py38",
    # Compute resources and the role the job runs under.
    instance_type="ml.c5.xlarge",
    instance_count=1,
    role=role,
    # The output location and the code location both point at the bucket root.
    code_location=s3uri,
    output_path=s3uri,
    # The whole configuration travels as one quoted JSON string.
    hyperparameters={"hyper_params": hyper_params},
    # use spot instance
    # use_spot_instances=True,
    # max_run=20000,
    # max_wait=20000,
)


In [5]:
# Run the training job, passing the dataset's S3 URI under the "train" key.
estimator.fit(inputs={"train": input_uri})

2022-09-17 15:28:48 Starting - Starting the training job...
2022-09-17 15:29:11 Starting - Preparing the instances for trainingProfilerReport-1663428528: InProgress
.........
2022-09-17 15:30:32 Downloading - Downloading input data...
2022-09-17 15:31:12 Training - Downloading the training image...
2022-09-17 15:31:36 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-09-17 15:31:38,412 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-09-17 15:31:38,414 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-09-17 15:31:38,420 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-09-17 15:31:38,426 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-0