In [None]:
!git clone https://github.com/salesforce/LAVIS.git

In [None]:
!curl -L https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_Linux-64bit.tar.gz | tar -xz s5cmd

In [None]:
import sagemaker
import boto3
from sagemaker import get_execution_role

# SageMaker context shared by the cells below: the session, the IAM role the
# notebook executes with, and the session's region and default S3 bucket.
sess = sagemaker.Session()
role = get_execution_role()

region = sess.boto_session.region_name
sagemaker_default_bucket = sess.default_bucket()

In [None]:
!cp s5cmd LAVIS/
# !cp requirements.txt LAVIS/
!cp entry.py LAVIS/
!cp caption_coco_ft_2.yaml LAVIS/
!cp default.yaml LAVIS/lavis/configs/
!cp caption_builder.py LAVIS/lavis/datasets/builders/
!cp train.py LAVIS/

In [None]:
import time
from datetime import datetime
from sagemaker.estimator import Estimator

# Training image. The pre-built AWS images are listed at
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
# and one of them is kept here as a commented-out alternative:
# image_uri = f'763104351884.dkr.ecr.{region}.amazonaws.com/pytorch-training:1.13.1-gpu-py39-cu117-ubuntu20.04-sagemaker'
# NOTE(review): the image in use is referenced by the mutable `latest` tag.
image_uri = f'633205212955.dkr.ecr.{region}.amazonaws.com/sagemaker-torch113-cu117-jre:latest'

# Cluster shape. p4d = 8 x 40G GPUs, p4de = 8 x 80G GPUs.
instance_type = 'ml.p4d.24xlarge'
instance_count = 1

# Each run writes its models under a fresh, timestamped prefix in the
# session's default bucket.
ts_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
model_output_path = f's3://{sagemaker_default_bucket}/output-models/blip-caption/{ts_str}/'

# Environment variables for the training container (passed to the Estimator as
# `environment`). NOTE(review): presumably read by entry.py -- confirm the
# variable names against that script.
environment = {
    # Config file and task script to run.
    'CONFIG_FILE': 'caption_coco_ft_2.yaml',
    'TASK_FILE': 'train.py',
    # Number of training instances, stringified for use as an env value.
    'NODE_NUMBER': str(instance_count),
    # Dataset location (options noted by the author: coco2014, coco-full).
    'DATA_S3_PATH': 's3://llm-artifacts-us-east-1/datasets/coco-full/',
    # Optional source model files (currently disabled):
    # 'MODEL_S3_PATH': 's3://llm-artifacts-us-east-1/blip-pretrain-pth/',
    # Annotation location (options noted by the author: coco2014-anno, cocokarp-anno).
    'ANNO_S3_PATH': 's3://llm-artifacts-us-east-1/datasets/cocokarp-anno/',
    # Destination S3 prefix for the trained model.
    'OUTPUT_MODEL_S3_PATH': model_output_path,
    # Optional sample cap (currently disabled); the dataset has 101 files in
    # total in a hierarchical directory layout:
    # 'TOTAL_NUM_SAMPLES': str(2000),
}

# Hyperparameters forwarded to the entry point.
hyp_param = {
    # Output directory inside the training container.
    'output_dir': '/tmp/local_output/',
}

# Describe the training job: code location, container, cluster shape, runtime
# configuration, and job lifetime.
estimator = Estimator(
    # What to run: entry.py from the staged LAVIS checkout.
    entry_point='entry.py',
    source_dir='./LAVIS',
    # Where and as whom to run it.
    role=role,
    image_uri=image_uri,
    instance_type=instance_type,
    instance_count=instance_count,
    base_job_name='blip2-coco-cap',
    # Runtime configuration passed into the container.
    environment=environment,
    hyperparameters=hyp_param,
    # Maximum job runtime in seconds (2 days here). Per the original note,
    # raising the limit (up to 28 days) requires a quota-increase ticket.
    max_run=2 * 24 * 3600,
    # Warm pool: keep the instances and image alive for the next training job
    # (rolling renewal, at most 1 hour). Requires the warm-pool quota.
    keep_alive_period_in_seconds=3600,
    # Optional input mode, see
    # https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html
    # input_mode='FastFile',
    # Profiler and debugger hooks are switched off.
    disable_profiler=True,
    debugger_hook_config=False,
)

# Alternative launch with an input channel: data in a channel is automatically
# copied to each node (e.g. /opt/ml/input/data/train1), and the data_path
# parameter given to torchrun must then be changed to that path.
# input_channel = {'train1': f's3://{sagemaker_default_bucket}/datasets/coig_alpaca_jsonline_data/'}
# estimator.fit(input_channel)

# Launch without input channels; the S3 locations travel in `environment`.
estimator.fit()

In [None]:
# List what is stored under this run's timestamped output prefix
# (model_output_path is interpolated from the Python namespace by IPython).
!aws s3 ls {model_output_path}