In [None]:
## Update sagemaker python sdk version
# Use the %pip magic (not !pip) so the upgrade lands in the environment of the
# running kernel rather than whatever `pip` happens to be first on the shell PATH.
%pip install -U sagemaker

## Set code

In [1]:
# Start from an empty ./src directory. WARNING: anything already in ./src is deleted.
!rm -rf src; mkdir src

In [2]:
# Fetch the Stanford Alpaca training code from GitHub into ./src/stanford_alpaca
!git -C src clone https://github.com/tatsu-lab/stanford_alpaca.git

Cloning into 'stanford_alpaca'...
remote: Enumerating objects: 129, done.[K
remote: Total 129 (delta 0), reused 0 (delta 0), pack-reused 129[K
Receiving objects: 100% (129/129), 9.15 MiB | 34.81 MiB/s, done.
Resolving deltas: 100% (62/62), done.


##### Modify the DeepSpeed config to save the model properly.
Set ```stage3_gather_16bit_weights_on_model_save``` to ```True``` if necessary.

In [3]:
import json

# DeepSpeed config file inside the cloned stanford_alpaca repository.
ds_config_file = './src/stanford_alpaca/configs/default_offload_opt_param.json'

# Load the current settings. The `with` block closes the file on exit, so no
# explicit close() call is needed.
with open(ds_config_file, 'r', encoding='utf-8') as f:
    ds_config = json.load(f)

# Turn on gathering of the 16-bit weights when the model is saved.
# Raises KeyError if the config has no 'zero_optimization' section.
ds_config['zero_optimization']['stage3_gather_16bit_weights_on_model_save'] = True

# Write the patched settings back to the same file, pretty-printed.
with open(ds_config_file, 'w', encoding='utf-8') as f:
    json.dump(ds_config, f, indent=2)

In [4]:
# Download the s5cmd v2.0.0 Linux release and extract only the `s5cmd` binary into
# the working directory. --fail (-f) makes curl stop on an HTTP error instead of
# piping an error page into tar, so a bad URL is reported as a curl error.
!curl -fL https://github.com/peak/s5cmd/releases/download/v2.0.0/s5cmd_2.0.0_Linux-64bit.tar.gz | tar -xz s5cmd

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 4176k  100 4176k    0     0  21.0M      0 --:--:-- --:--:-- --:--:-- 21.0M


## Optional - Upload data to S3

In [None]:
# Upload the local dataset file to S3 with the AWS CLI.
# The destination bucket/prefix is hard-coded; presumably it must be replaced with
# a bucket you own -- TODO confirm.
# Alternative (left disabled): sync with the s5cmd binary instead:
#!./s5cmd sync <source_path> <destination_path>
!aws s3 cp ./alpaca_data_2.json s3://llm-artifacts-us-east-1/datasets/alpaca-coig-mix-1016/

## Launch training

In [23]:
import boto3
import sagemaker
from sagemaker import get_execution_role

# IAM role the notebook runs under; passed to the Estimator as `role`.
role = get_execution_role()

# SageMaker session plus the region and default bucket derived from it.
sess = sagemaker.Session()
region = sess.boto_session.region_name
sagemaker_default_bucket = sess.default_bucket()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [27]:
# MODE Configs
# train_mode = 'STREAM' # STREAM / BATCH
# Training script: copied from the notebook directory into src/stanford_alpaca/ and
# passed to the job through the TRAIN_FILENAME environment variable.
train_filename = 'train-stream-batch-wiki.py'
# DeepSpeed config: copied into src/stanford_alpaca/configs/ and passed to the job
# through the DS_CONFIG_FILENAME environment variable.
# NOTE(review): the earlier cell patches configs/default_offload_opt_param.json, not
# this file -- confirm that stage3_gather_16bit_weights_on_model_save is also set here
# if it is needed.
ds_config_filename = 'ds-stage-3-config.json'

In [32]:
# Stage the s5cmd binary and the job scaffolding at the root of the source dir.
!cp s5cmd entry.py requirements.txt train.sh src/

# Place the selected training script and DeepSpeed config inside the cloned repo.
!cp {train_filename} src/stanford_alpaca/
!cp {ds_config_filename} src/stanford_alpaca/configs/

In [None]:
import time
from sagemaker.estimator import Estimator

# Pre-built AWS Deep Learning Container image; the catalogue of images is at
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
image_uri = '763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.13.1-gpu-py39-cu117-ubuntu20.04-sagemaker'

# Cluster shape. Author's note on alternative instance types: p4d - 8*40G / p4de - 8*80G
instance_type = 'ml.g5.48xlarge'
instance_count = 1

# Environment variables exposed to the training container.
environment = dict(
    TOTAL_NUM_SAMPLES='6000',
    NODE_NUMBER=str(instance_count),
    TRAIN_FILENAME=train_filename,
    DS_CONFIG_FILENAME=ds_config_filename,
    # DATA_S3_PATH='s3://llm-artifacts-us-east-1/datasets/wiki-zh/*',
    MODEL_S3_PATH='s3://llm-artifacts-us-east-1/bloke-llama2-7b-fp16/*',  # source model files
    OUTPUT_MODEL_S3_PATH='s3://llm-artifacts-us-east-1/output-models/llama2-7b-fp16/',  # destination
)

estimator = Estimator(
    image_uri=image_uri,
    role=role,
    source_dir='./src',
    entry_point='entry.py',
    base_job_name='stream-batch-alpaca-train',
    instance_type=instance_type,
    instance_count=instance_count,
    environment=environment,
    input_mode='FastFile',
    # Maximum lifetime of the job. Original note: default is 2 days; a support
    # ticket is needed to raise the quota, up to a maximum of 28 days.
    max_run=2 * 24 * 3600,
    # Warm pool: keep the instance and image around for the next training job
    # (rolling renewal, at most 1 hour). Original note: the corresponding limit
    # must be raised first.
    keep_alive_period_in_seconds=3600,
    disable_profiler=True,
    debugger_hook_config=False,
)


# Author's note: data in the channel will be automatically copied to each node
# under /opt/ml/input/data/train1, so the data_path param passed to torchrun
# should point at that path.
# NOTE(review): input_mode is 'FastFile' above -- confirm "copied" is still accurate.

input_channel = {'train1': 's3://llm-artifacts-us-east-1/datasets/wiki-zh/'}
estimator.fit(inputs=input_channel)

# estimator.fit()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Using provided s3_resource


INFO:sagemaker:Creating training-job with name: stream-batch-alpaca-train-2023-10-27-15-10-14-359


2023-10-27 15:10:18 Starting - Starting the training job...
2023-10-27 15:10:34 Downloading - Downloading input data
2023-10-27 15:10:34 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-10-27 15:10:34,857 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-10-27 15:10:34,918 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-10-27 15:10:34,930 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-10-27 15:10:34,932 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-10-27 15:10:36,418 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/opt/conda/bin/python3.9 -m pip install -r requirements.txt[0m
[3

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Using provided s3_resource


INFO:sagemaker:Creating training-job with name: stream-batch-alpaca-train-2023-10-26-10-07-45-257


2023-10-26 10:07:49 Starting - Starting the training job......
2023-10-26 10:08:27 Starting - Preparing the instances for training.........
2023-10-26 10:10:01 Downloading - Downloading input data...
2023-10-26 10:10:31 Training - Downloading the training image..................
2023-10-26 10:13:37 Training - Training image download completed. Training in progress.......[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-10-26 10:14:32,971 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-10-26 10:14:33,031 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-10-26 10:14:33,042 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-10-26 10:14:33,045 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-10-26 10: