In [2]:
## Update sagemaker python sdk version
!pip install -U sagemaker

Collecting sagemaker
  Downloading sagemaker-2.221.1-py3-none-any.whl.metadata (14 kB)
Downloading sagemaker-2.221.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.219.0
    Uninstalling sagemaker-2.219.0:
      Successfully uninstalled sagemaker-2.219.0
Successfully installed sagemaker-2.221.1


## Prepare data

In [2]:
!python data/preprocess_kl.py --data_path '/home/ec2-user/SageMaker/klook/data0527/original_hotel_image_data.csv' --output_folder '/home/ec2-user/SageMaker/klook/data0527/'

<<< load excel data!
100%|█████████████████████████████████████| 1041/1041 [00:00<00:00, 7141.86it/s]
Data written to json
Data written to json


In [5]:
# List the folder that was passed as --output_folder to the preprocessing step.
!ls /home/ec2-user/SageMaker/klook/data0527/

data.json     img			     test.json
data_v2.json  original_hotel_image_data.csv  test_v2.json


In [6]:
#cp data
! aws s3 cp /home/ec2-user/SageMaker/klook/data0527/data_v2.json s3://sagemaker-us-west-2-726335585155/klook/data0527/ 

upload: ../../../klook/data0527/data_v2.json to s3://sagemaker-us-west-2-726335585155/klook/data0527/data_v2.json


In [7]:
! aws s3 cp /home/ec2-user/SageMaker/klook/data0527/test_v2.json s3://sagemaker-us-west-2-726335585155/klook/data0527/ 

upload: ../../../klook/data0527/test_v2.json to s3://sagemaker-us-west-2-726335585155/klook/data0527/test_v2.json


In [8]:
! aws s3 ls s3://sagemaker-us-west-2-726335585155/klook/data0527/ 

                           PRE .ipynb_checkpoints/
                           PRE img/
2024-05-27 09:17:28     465378 data.json
2024-05-28 13:46:02     302114 data_v2.json
2024-05-27 09:17:36     120046 original_hotel_image_data.csv
2024-05-27 09:17:36     466255 test.json
2024-05-28 13:46:12     302645 test_v2.json


## Launch training

In [1]:
import sagemaker
import boto3
from sagemaker import get_execution_role

# Shared SageMaker session; the bucket and region below are read from it.
sess = sagemaker.Session()

# IAM role that the notebook is executing with.
role = get_execution_role()

# Region of the underlying boto3 session and the session's default S3 bucket.
region = sess.boto_session.region_name
sagemaker_default_bucket = sess.default_bucket()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [3]:
import time
from sagemaker.estimator import Estimator

# Pre-built AWS Deep Learning Container image used for the training job.
# Available images: https://github.com/aws/deep-learning-containers/blob/master/available_images.md
image_uri = '763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.13.1-gpu-py39-cu117-ubuntu20.04-sagemaker'

instance_count = 1
# instance_type = 'ml.p4d.24xlarge'  # p4d: 8 x 40G / p4de: 8 x 80G
instance_type = 'ml.g5.48xlarge'

# Reuse the session created in the setup cell (the same cell that defines
# `role`) instead of constructing a second sagemaker.Session().
bucket = sess.default_bucket()

# NOTE: this variable only names the S3 prefix for checkpoints; the training
# job itself is named by the `base_job_name=` argument passed to Estimator.
base_job_name = "sagemaker-checkpoint-test"
checkpoint_in_bucket = "checkpoints-klook-llava-1.5-13b-hf"
# S3 URI (not a bare bucket name, despite the variable name) for checkpoints.
checkpoint_s3_bucket = "s3://{}/{}/{}".format(bucket, base_job_name, checkpoint_in_bucket)

# Environment variables passed to the training container.
# DATA_PATH / IMAGE_FOLDER point inside the 'train' input channel declared
# below; OUTPUT_DIR matches checkpoint_local_path on the Estimator.
# Presumably these are read by the entry point script - verify in ./src.
environment = {
    'NODE_NUMBER': str(instance_count),
    'MODEL_NAME_OR_PATH': 'llava-hf/llava-1.5-13b-hf',
    'DATA_PATH': '/opt/ml/input/data/train/data_v2.json',
    'IMAGE_FOLDER': '/opt/ml/input/data/train',
    'OUTPUT_DIR': '/opt/ml/checkpoints',
    'DEVICE_TRAIN_BATCH_SIZE': '4',
    'MODEL_MAX_LENGTH': '2048',
    'EPOCH': '10',
}

estimator = Estimator(
    role=role,
    entry_point='entry_single.py',
    source_dir='./src',
    base_job_name='klook-llava-train-epoch10-llava15-13b-hf',
    instance_count=instance_count,
    instance_type=instance_type,
    image_uri=image_uri,
    environment=environment,
    volume_size=512,
    # Maximum job runtime in seconds (5 days here). Per the original note the
    # default is 2 days, and a quota-increase ticket can raise the cap to 28 days.
    max_run=5 * 24 * 3600,
    # Optional warm pool (disabled): keeps the instance and image alive for the
    # next training job (rolling renewal, max 1 hour); requires a quota increase.
    # keep_alive_period_in_seconds=3600,
    disable_profiler=True,
    debugger_hook_config=False,
    checkpoint_s3_uri=checkpoint_s3_bucket,
    checkpoint_local_path='/opt/ml/checkpoints',
)

# Data in a channel is copied automatically to each node under
# /opt/ml/input/data/<channel name>, i.e. /opt/ml/input/data/train here.
# DATA_PATH / IMAGE_FOLDER above must point inside that directory.
input_channel = {'train': 's3://sagemaker-us-west-2-726335585155/klook/data0527/'}
estimator.fit(input_channel)

INFO:sagemaker:Creating training-job with name: klook-llava-train-epoch10-llava15-13b-h-2024-06-04-06-01-53-813


2024-06-04 06:01:56 Starting - Starting the training job...
2024-06-04 06:02:16 Pending - Training job waiting for capacity...
2024-06-04 06:02:50 Pending - Preparing the instances for training.........
2024-06-04 06:03:58 Downloading - Downloading input data...
2024-06-04 06:04:38 Downloading - Downloading the training image...............
2024-06-04 06:07:19 Training - Training image download completed. Training in progress...[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-06-04 06:07:34,628 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-06-04 06:07:34,696 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-06-04 06:07:34,709 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-06-04 06:07:34,711 sagemaker_pytorch_container.training INFO