In [None]:
## notebook instance - ml.c5.xlarge; jupyter kernel - conda_python3
## Update sagemaker python sdk version
!pip install -U sagemaker

# restart kernel after installation

## Set code

In [1]:
# Start from an empty ./src staging directory: remove any previous copy,
# then recreate it. Later cells fill it with the training code.
!rm -rf src
!mkdir src

In [2]:
# download training script from github
# NOTE(review): the clone is not pinned to a tag or commit, so the cells that
# rely on this repo's layout (configs/default_offload_opt_param.json, train.py)
# depend on whatever the default branch contains at clone time.
!cd src && git clone https://github.com/tatsu-lab/stanford_alpaca.git

Cloning into 'stanford_alpaca'...
remote: Enumerating objects: 129, done.[K
remote: Counting objects: 100% (75/75), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 129 (delta 57), reused 50 (delta 50), pack-reused 54[K
Receiving objects: 100% (129/129), 9.14 MiB | 5.98 MiB/s, done.
Resolving deltas: 100% (62/62), done.


##### Modify the DeepSpeed config to save the model properly.
Set ```stage3_gather_16bit_weights_on_model_save``` to ```True``` if necessary.

In [3]:
import json

# DeepSpeed config file inside the cloned stanford_alpaca repository.
ds_config_file = './src/stanford_alpaca/configs/default_offload_opt_param.json'

# The `with` statement closes the file on exit, so no explicit close() is needed.
with open(ds_config_file, 'rb') as f:
    ds_config = json.load(f)

# Turn on the ZeRO option that controls 16-bit weight gathering at save time.
# Raises KeyError if the config has no 'zero_optimization' section.
ds_config['zero_optimization']['stage3_gather_16bit_weights_on_model_save'] = True

# Write the modified config back over the original file.
with open(ds_config_file, 'w') as f:
    json.dump(ds_config, f, indent=2)

In [4]:
# download s5cmd
!curl -L https://github.com/peak/s5cmd/releases/download/v2.0.0/s5cmd_2.0.0_Linux-64bit.tar.gz | tar -xz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 4176k  100 4176k    0     0  19.4M      0 --:--:-- --:--:-- --:--:-- 19.4M


In [3]:
!cp s5cmd src/
!cp entry.py src/
!cp requirements.txt src/
!cp train.sh src/
## Replace original train.py
!cp train.py src/stanford_alpaca/

## Optional - Put data to s3

In [None]:
#!./s5cmd sync <source_path> <destination_path>
# !aws s3 cp ./src/stanford_alpaca/alpaca_data.json s3://<SOME-BUCKET-NAME>/datasets/alpaca-coig-mix/

## Launch training

In [8]:
import sagemaker
import boto3
from sagemaker import get_execution_role

# SageMaker session plus the execution role; `role` is passed to the
# Estimator in the launch cell.
sess = sagemaker.Session()
role = get_execution_role()
# Default bucket and region of the session. Neither name is referenced by the
# other cells visible in this notebook.
sagemaker_default_bucket = sess.default_bucket()
region = sess.boto_session.region_name

In [None]:
import time
from sagemaker.estimator import Estimator

## pre-built docker in https://github.com/aws/deep-learning-containers/blob/master/available_images.md
# NOTE(review): the image URI hard-codes us-east-1 even though `region` is
# computed in the previous cell - confirm the notebook runs in that region.
# image_uri = '763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.0.0-gpu-py310-cu118-ubuntu20.04-sagemaker'
image_uri = '763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.13.1-gpu-py39-cu117-ubuntu20.04-sagemaker'
# image_uri = "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.13.1-transformers4.26.0-gpu-py39-cu117-ubuntu20.04"

instance_count = 1
instance_type = 'ml.p4d.24xlarge' ## p4d - 8*40G / p4de - 8*80G

# Environment variables set in the training container.
# Presumably read by entry.py / train.sh - verify against those scripts.
environment = {
    'NODE_NUMBER':str(instance_count),
    'MODEL_S3_PATH': 's3://llm-artifacts-us-east-1/bloke-llama2-7b-fp16/*', # source model files
    'OUTPUT_MODEL_S3_PATH': 's3://llm-artifacts-us-east-1/output-models/bloke-llama2-7b-fp16/', # destination
}

# Training job definition: everything under ./src is shipped as the source
# directory and entry.py is the entry point inside the container.
estimator = Estimator(role=role,
                      entry_point='entry.py',
                      source_dir='./src',
                      base_job_name='single-node-alpaca-train',
                      instance_count=instance_count,
                      instance_type=instance_type,
                      image_uri=image_uri,
                      environment=environment,
                      max_run=2*24*3600, # maximum job runtime; the default is 2 days, a quota-increase ticket can raise it up to 28 days
                      keep_alive_period_in_seconds=3600, # warm pool: keep the instances and image for the next training job (rolling renewal, at most 1 hour); requires a quota
                      disable_profiler=True,
                      debugger_hook_config=False)


# # data in channel will be automatically copied to each node - /opt/ml/input/data/train1
# # should change data_path param to above path in torchrun
# input_channel = {'train1': 's3://llm-artifacts-us-east-1/datasets/alpaca-coig-mix/'}
# estimator.fit(input_channel)

# Launch the training job with no input channels.
estimator.fit()

Using provided s3_resource


INFO:sagemaker:Creating training-job with name: multi-node-alpaca-train-2023-07-31-08-23-11-394


2023-07-31 08:23:16 Starting - Starting the training job......
2023-07-31 08:23:53 Starting - Preparing the instances for training.............[34m1%|          | 7/609 [02:24<3:24:59, 20.43s/it][0m
[34m1%|▏         | 8/609 [02:44<3:24:27, 20.41s/it][0m
[34m1%|▏         | 9/609 [03:05<3:23:52, 20.39s/it][0m
[34m2%|▏         | 10/609 [03:25<3:23:36, 20.40s/it][0m
[34m2%|▏         | 11/609 [03:45<3:22:46, 20.35s/it][0m
[34m2%|▏         | 12/609 [04:06<3:22:48, 20.38s/it][0m
[34m2%|▏         | 13/609 [04:26<3:23:07, 20.45s/it][0m
[34m2%|▏         | 14/609 [04:46<3:21:59, 20.37s/it][0m
[34m2%|▏         | 15/609 [05:07<3:20:53, 20.29s/it][0m
[34m3%|▎         | 16/609 [05:27<3:20:27, 20.28s/it][0m
[34m3%|▎         | 17/609 [05:47<3:20:02, 20.28s/it][0m
[34m3%|▎         | 18/609 [06:07<3:19:59, 20.30s/it][0m
[34m3%|▎         | 19/609 [06:28<3:19:50, 20.32s/it][0m
[34m3%|▎         | 20/609 [06:48<3:20:06, 20.38s/it][0m
[34m3%|▎         | 21/609 [07:09<3:20:07, 20.42

In [None]:
rm -rf src