In [1]:
import os
from datetime import datetime

import boto3
from sagemaker import analytics, image_uris
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role
from sagemaker.debugger import (
    Rule,
    DebuggerHookConfig,
    TensorBoardOutputConfig,
    CollectionConfig,
    ProfilerConfig,
    FrameworkProfile,
    DetailedProfilingConfig,
    rule_configs,
    ProfilerRule,
)
from smdebug.core.collection import CollectionKeys

In [2]:
#import sagemaker
#import botocore

In [3]:
#!pip install --upgrade sagemaker boto3 botocore

In [4]:
# Profiler settings handed to the estimator further down: the system monitor
# interval is set to 500 ms.
profiler_config = ProfilerConfig(system_monitor_interval_millis=500)

In [5]:
# Pin the AWS region through the process environment; this runs before the
# boto3 session is created, so the session resolves to us-west-2.
os.environ["AWS_DEFAULT_REGION"] = "us-west-2"

In [6]:
# A single boto3 session backs this notebook: both the resolved region name and
# the SageMaker client are taken from it, instead of constructing a second
# throwaway session just to read the region.
# (`time_str` is assigned later in this cell, right before it is first used.)
boto_sess = boto3.Session()
region = boto_sess.region_name
sm = boto_sess.client("sagemaker")

# Bucket that receives the uploaded source code and the training artifacts.
s3_bucket = "s3://jbsnyder-sagemaker-us-west-2/"

base_job_name = "openclip-scaling"

# Read the clock once and derive both strings from that single instant, so the
# date folder and the timestamp embedded in the job name can never disagree
# (two separate now() calls can straddle midnight).
launch_time = datetime.now()
date_str = launch_time.strftime("%d-%m-%Y")
time_str = launch_time.strftime("%d-%m-%Y-%H-%M-%S")
job_name = f"{base_job_name}-{time_str}"

# S3 URIs always use "/" as the separator. os.path.join follows the host OS
# convention instead, so the prefixes are joined explicitly.
s3_root = s3_bucket.rstrip("/")
output_path = "/".join([s3_root, "sagemaker-output", date_str, job_name])
code_location = "/".join([s3_root, "sagemaker-code", date_str, job_name])

In [7]:
# --- Cluster shape and batch sizing ---------------------------------------
nodes = 4
# NOTE(review): 8 matches the GPU count of the ml.p4d.24xlarge instance type
# chosen further down -- confirm and update if the instance type changes.
gpus_per_node = 8
# NOTE(review): presumably the per-GPU batch size, given how it is multiplied
# out below -- confirm against the training script.
batch_size = 512
global_batch_size = batch_size * nodes * gpus_per_node

# An "epoch" is defined here as a fixed number of global batches.
steps_per_epoch = 1000
samples_per_epoch = global_batch_size * steps_per_epoch
epochs = 50

# --- Learning rate ----------------------------------------------------------
# Linear scaling: base_lr at a global batch of base_global_batch_size, scaled
# in proportion to the actual global batch size.
base_lr = 1e-4
base_global_batch_size = 4096
lr = base_lr / base_global_batch_size * global_batch_size

In [8]:
# Hyperparameters passed to the estimator below.
# The two data sources are "pipe:" commands that `aws s3 cp` a brace-expanded
# range of .tar shards to stdout. Their values deliberately include literal
# double quotes.
# NOTE(review): presumably the quotes keep each command as a single argument
# when it reaches the training script -- confirm against sm_train.py.
hyperparameters = {
    # Data sources and the sample counts reported for them.
    "train-data": '"pipe:aws s3 cp s3://jbsnyder-sagemaker-us-west-2/data/laion/laion400m/data/{00000..02499}.tar -"',
    "train-num-samples": samples_per_epoch,
    "val-data": '"pipe:aws s3 cp s3://jbsnyder-sagemaker-us-west-2/data/laion/laion400m/data/{02500..02587}.tar -"',
    "val-num-samples": 10000,
    # Optimisation settings.
    "batch-size": batch_size,
    "warmup": 2000,
    "epochs": epochs,
    "lr": lr,
    "wd": 0.2,
    "precision": "amp",
    # Data loading, logging and distributed backend.
    "workers": 4,
    "log-every-n-steps": 100,
    "log-path": "/opt/ml/checkpoints/logs/",
    "dist-backend": "smddp",
}

# Script (relative to the estimator's source_dir) that the job runs.
entry_point = "sm_train.py"

# Distribution settings: enable the smdistributed data-parallel option.
distribution = dict(smdistributed=dict(dataparallel=dict(enabled=True)))

In [9]:
# Custom training image hosted in ECR (us-west-2) and the instance type each
# node of the job runs on.
image_uri = "920076894685.dkr.ecr.us-west-2.amazonaws.com/jbsnyder:open-clip-sm"
instance_type = "ml.p4d.24xlarge"

In [10]:
# All artifact prefixes live under output_path. S3 URIs always use "/" as the
# separator, so they are built with plain string formatting rather than
# os.path.join, which would insert the host OS separator instead.
artifact_root = output_path.rstrip("/")
training_output_uri = f"{artifact_root}/training-output"
training_checkpoint_uri = f"{artifact_root}/training-checkpoints"
training_model_uri = f"{artifact_root}/training-model"

# Estimator describing the distributed training job: code from ./src, the
# custom container image, and `nodes` instances of `instance_type`.
estimator = PyTorch(
    # What to run.
    source_dir="./src",
    entry_point=entry_point,
    image_uri=image_uri,
    hyperparameters=hyperparameters,
    # Where and how to run it.
    base_job_name=job_name,
    role=get_execution_role(),
    instance_count=nodes,
    instance_type=instance_type,
    distribution=distribution,
    max_run=36000,  # runtime cap in seconds (10 hours)
    # Where results and uploaded code go.
    output_path=training_output_uri,
    checkpoint_s3_uri=training_checkpoint_uri,
    model_dir=training_model_uri,
    code_location=code_location,
    # Profiling uses the config built earlier; the debugger hook is disabled.
    profiler_config=profiler_config,
    debugger_hook_config=False,
)

In [11]:
# Submit the job under the explicit timestamped name, with no input channels,
# and return immediately (wait=False) rather than blocking the kernel.
estimator.fit(inputs=None, job_name=job_name, wait=False)

INFO:sagemaker:Creating training-job with name: openclip-scaling-24-01-2023-21-17-32


In [12]:
# Print the submitted job's logs into this cell. In the saved output below,
# the log stream ends with a KeyboardInterrupt.
estimator.logs()

2023-01-24 21:17:35 Starting - Starting the training job......
2023-01-24 21:18:18 Starting - Preparing the instances for training..............................
2023-01-24 21:23:38 Downloading - Downloading input data...
2023-01-24 21:23:48 Training - Downloading the training image...........................
2023-01-24 21:28:30 Training - Training image download completed. Training in progress.
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2023-01-24 21:28:33,354 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2023-01-24 21:28:33,425 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2023-01-24 21:28:33,435 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2023-01-24 21:28:33,442 sagemaker_pytorch_container.training INFO     Invoking SMDataParallel
[

KeyboardInterrupt: 

In [14]:
# Display the S3 prefix that was passed to the estimator as checkpoint_s3_uri.
estimator.checkpoint_s3_uri

's3://jbsnyder-sagemaker-us-west-2/sagemaker-output/24-01-2023/openclip-scaling-24-01-2023-21-17-32/training-checkpoints'

In [15]:
import numpy as np

In [17]:
# Scratch check: square root of 1e-10 (displays 1e-05).
np.sqrt(1e-10)

1e-05

In [19]:
import torch

In [None]:
# NOTE(review): looks like an unfinished scratch expression (the cell was never
# executed). `torch.op` is presumably not a real attribute (cf. `torch.ops`)
# and would raise on Restart & Run All -- confirm, then complete or remove.
torch.op