In [None]:
## Update sagemaker python sdk version
!pip install -U sagemaker

## Set up the training code

In [1]:
!rm -rf src
!mkdir src

In [2]:
# download training script from github
!cd src && git clone https://github.com/tatsu-lab/stanford_alpaca.git

Cloning into 'stanford_alpaca'...
remote: Enumerating objects: 129, done.[K
remote: Counting objects: 100% (75/75), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 129 (delta 57), reused 50 (delta 50), pack-reused 54[K
Receiving objects: 100% (129/129), 9.14 MiB | 5.93 MiB/s, done.
Resolving deltas: 100% (62/62), done.


##### Modify the DeepSpeed config so the model is saved properly.
Set ```stage3_gather_16bit_weights_on_model_save``` to ```True``` if necessary.

In [None]:
import json

# DeepSpeed config file inside the cloned stanford_alpaca repository.
ds_config_file = './src/stanford_alpaca/configs/default_offload_opt_param.json'

# Load the current settings. The `with` block closes the file on exit, so no
# explicit close() is needed.
with open(ds_config_file, 'rb') as config_in:
    ds_config = json.loads(config_in.read())

# Turn on stage3_gather_16bit_weights_on_model_save in the zero_optimization section.
ds_config['zero_optimization']['stage3_gather_16bit_weights_on_model_save'] = True

# Write the patched settings back to the same file, pretty-printed.
with open(ds_config_file, 'w') as config_out:
    config_out.write(json.dumps(ds_config, indent=2))

In [4]:
!curl -L https://github.com/peak/s5cmd/releases/download/v2.0.0/s5cmd_2.0.0_Linux-64bit.tar.gz | tar -xz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 4176k  100 4176k    0     0  18.1M      0 --:--:-- --:--:-- --:--:-- 18.1M


In [3]:
!mv s5cmd src/
!mv entry.py src/
!mv requirements.txt src/
!mv train.sh src/
## Replace original train.py
!mv train.py src/stanford_alpaca/

## Optional - Upload the dataset to S3

In [8]:
# Alternative upload using s5cmd (template only; fill in the paths).
# NOTE: an earlier cell moves the s5cmd binary into src/, so from this directory
# it would have to be invoked as ./src/s5cmd rather than ./s5cmd.
#!./s5cmd sync <source_path> <destination_path>
# Copy the Alpaca dataset from the cloned repo to the S3 dataset prefix.
!aws s3 cp ./src/stanford_alpaca/alpaca_data.json s3://llm-artifacts-us-east-1/datasets/alpaca-coig-mix/

upload: src/stanford_alpaca/alpaca_data.json to s3://llm-artifacts-us-east-1/datasets/alpaca-coig-mix/alpaca_data.json


## Launch training

In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role

# SageMaker session plus the IAM execution role used to launch the training job.
sess = sagemaker.Session()
role = get_execution_role()

# Region of the session's underlying boto session, and the session's default bucket.
region = sess.boto_session.region_name
sagemaker_default_bucket = sess.default_bucket()

In [None]:
import time
from sagemaker.estimator import Estimator

## Training image. Pre-built Deep Learning Container images are listed at
## https://github.com/aws/deep-learning-containers/blob/master/available_images.md
## The commented-out lines are alternative images.
# image_uri = '763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.0.0-gpu-py310-cu118-ubuntu20.04-sagemaker'
image_uri = '763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.13.1-gpu-py39-cu117-ubuntu20.04-sagemaker'
# image_uri = "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.13.1-transformers4.26.0-gpu-py39-cu117-ubuntu20.04"

## Cluster shape: number of nodes and the instance type of each node.
instance_type = 'ml.p4d.24xlarge' ## p4d - 8*40G / p4de - 8*80G
instance_count = 2

## Environment variables passed to the training container.
environment = {
    'NODE_NUMBER': str(instance_count),
    'MODEL_S3_PATH': 's3://llm-artifacts-us-east-1/bloke-llama2-7b-fp16/*', # source model files
    'OUTPUT_MODEL_S3_PATH': 's3://llm-artifacts-us-east-1/output-models/bloke-llama2-7b-fp16/', # destination
}

estimator = Estimator(
    role=role,
    entry_point='entry.py',
    source_dir='./src',
    base_job_name='multi-node-alpaca-train',
    instance_count=instance_count,
    instance_type=instance_type,
    image_uri=image_uri,
    environment=environment,
    # Maximum job runtime in seconds (2 days here). Per the original author's
    # note, raising the limit needs a quota-increase ticket, up to 28 days.
    max_run=2 * 24 * 3600,
    # Warm pool: keep the instances and image available for the next training
    # job (rolling renewal, at most 1 hour). Requires the warm-pool quota to be
    # enabled.
    keep_alive_period_in_seconds=3600,
    disable_profiler=True,
    debugger_hook_config=False,
)


# # data in channel will be automatically copied to each node - /opt/ml/input/data/train1
# # should change data_path param to above path in torchrun
# input_channel = {'train1': 's3://llm-artifacts-us-east-1/datasets/alpaca-coig-mix/'}
# estimator.fit(input_channel)

# Launch the training job without any input data channels.
estimator.fit()

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


Using provided s3_resource


INFO:sagemaker:Creating training-job with name: multi-node-alpaca-2023-07-19-15-26-59-930


2023-07-19 15:27:04 Starting - Starting the training job..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-07-19 15:27:15,133 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-07-19 15:27:15,196 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-07-19 15:27:15,205 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-07-19 15:27:15,207 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-07-19 15:27:15,767 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/opt/conda/bin/python3.9 -m pip install -r requirements.txt[0m
[34mCollecting rouge_score (from -r requirements.txt (line 2))[0m
[34mDownloading rouge_score-0.1.2.tar.gz (17 kB)[0m
[34mPreparing metadata (se