In [1]:
import os
from datetime import datetime
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch

In [2]:
# Configuration for the training job launched further down. Everything a
# reader is likely to tune (bucket, instance type, node count, image) is here.

# Set the default AWS region through the environment for this kernel.
os.environ['AWS_DEFAULT_REGION'] = "us-east-1"

# this is all for naming
# Take ONE timestamp and format it twice: two separate datetime.now() calls
# can straddle midnight, which would put the job under a date folder that
# does not match the date embedded in its own name.
launch_time = datetime.now()
date_str = launch_time.strftime("%d-%m-%Y")
time_str = launch_time.strftime("%d-%m-%Y-%H-%M-%S")

S3_BUCKET = 'jbsnyder-sagemaker-us-east' # Don't include s3:// in your bucket name
S3_DIR = 'ptlightning-tutorial'
R50_WEIGHTS = "resnet50.pkl"

user_id = "jbsnyder"

# specify training type, s3 src and nodes
instance_type = "ml.p4d.24xlarge" # This can be any of 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge', 'ml.p3.8xlarge', 'ml.p3.2xlarge', 'ml.g4dn.12xlarge'
nodes = 1
# S3 URIs always use "/" as the separator, so build them explicitly instead of
# with os.path.join, which uses the host OS separator (backslash on Windows).
s3_location = f"s3://{S3_BUCKET}/{S3_DIR}"
role = get_execution_role() #give Sagemaker permission to launch nodes on our behalf
source_dir = '../sm_src'

entry_point = "run.sh"

# Replace [acct_num] with the AWS account number that hosts the image.
docker_image = "[acct_num].dkr.ecr.us-east-1.amazonaws.com/jbsnyder:pytorch-mlperf"

# Job artifacts and uploaded code are grouped as <kind>/<date>/<job name>.
job_name = f'{user_id}-{time_str}'
output_path = "/".join([s3_location, "sagemaker-output", date_str, job_name])
code_location = "/".join([s3_location, "sagemaker-code", date_str, job_name])



In [3]:
# Describe the training job; nothing is launched until .fit() is called.
# NOTE(review): output_path, checkpoint_s3_uri and model_dir all point at the
# same S3 prefix -- confirm that sharing one prefix is intended.
estimator = PyTorch(
    # What to run: the entry script, the directory it ships with, and the image.
    entry_point=entry_point,
    source_dir=source_dir,
    image_uri=docker_image,
    # Who runs it and on what hardware.
    role=role,
    instance_type=instance_type,
    instance_count=nodes,
    volume_size=500,
    distribution=None,
    # Where code and results are stored in S3.
    code_location=code_location,
    output_path=output_path,
    checkpoint_s3_uri=output_path,
    model_dir=output_path,
    # Profiler and debugger are not needed for this training; turning them
    # off reduces the number of logs.
    disable_profiler=True,
    debugger_hook_config=False,
)

In [4]:
# Submit the job under the name built in the configuration cell; wait=False
# returns right away instead of blocking the notebook until training ends.
estimator.fit(job_name=job_name, wait=False)