In [None]:
## Update sagemaker python sdk version
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel. Restart the kernel afterwards if
# sagemaker was already imported.
%pip install -U sagemaker

## Set up code

In [1]:
# Start from an empty ./src directory: remove any previous copy, then recreate it.
!rm -rf src; mkdir src

In [2]:
# Fetch the training scripts from GitHub into ./src/stanford_alpaca
!git -C src clone https://github.com/tatsu-lab/stanford_alpaca.git

Cloning into 'stanford_alpaca'...
remote: Enumerating objects: 129, done.[K
remote: Total 129 (delta 0), reused 0 (delta 0), pack-reused 129[K
Receiving objects: 100% (129/129), 9.15 MiB | 34.81 MiB/s, done.
Resolving deltas: 100% (62/62), done.


##### Modify the DeepSpeed config to save the model properly.
Set ```stage3_gather_16bit_weights_on_model_save``` to ```True``` if necessary.

In [3]:
import json

ds_config_file = './src/stanford_alpaca/configs/default_offload_opt_param.json'

# Read the DeepSpeed config from the cloned repository. The `with` block
# closes the file on exit, so no explicit close() is needed.
with open(ds_config_file, 'rb') as config_in:
    ds_config = json.load(config_in)

# Turn on the flag in the zero_optimization section so the model is saved
# properly (see the heading above this cell).
ds_config['zero_optimization'].update(stage3_gather_16bit_weights_on_model_save=True)

# Write the patched config back to the same path, pretty-printed.
with open(ds_config_file, 'w') as config_out:
    json.dump(ds_config, config_out, indent=2)

In [4]:
# Download the s5cmd v2.0.0 release tarball and extract only the `s5cmd` binary
# into the current directory.
#   curl -f : fail on HTTP errors instead of piping an error page into tar
#   tar -f -: read the archive from stdin explicitly
!curl -fL https://github.com/peak/s5cmd/releases/download/v2.0.0/s5cmd_2.0.0_Linux-64bit.tar.gz | tar -xzf - s5cmd

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 4176k  100 4176k    0     0  21.0M      0 --:--:-- --:--:-- --:--:-- 21.0M


## Optional - Upload data to S3

In [None]:
# Alternative using the s5cmd binary downloaded earlier (uncomment and fill in the paths):
#!./s5cmd sync <source_path> <destination_path>
# Copy the local dataset file alpaca_data_2.json to the S3 prefix below.
!aws s3 cp ./alpaca_data_2.json s3://llm-artifacts-us-east-1/datasets/alpaca-coig-mix-1016/

## Launch training

In [1]:
import sagemaker
import boto3
from sagemaker import get_execution_role

# SageMaker session used for the lookups below.
sess = sagemaker.Session()
# Execution role; passed to the Estimator as `role` in the launch cell.
role = get_execution_role()
# Default bucket reported by the session.
# NOTE(review): not referenced by the cells shown here — confirm whether it is still needed.
sagemaker_default_bucket = sess.default_bucket()
# Region name of the session's underlying boto session.
# NOTE(review): the launch cell hard-codes us-east-1 rather than using this value.
region = sess.boto_session.region_name

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [14]:
%%writefile src/stanford_alpaca/configs/default_offload_opt_param.json
{
	"bf16": {
		"enabled": "auto"
	},
	"optimizer": {
		"type": "AdamW",
		"params": {
			"lr": "auto",
			"betas": "auto",
			"eps": "auto",
			"weight_decay": "auto"
		}
	},
	"scheduler": {
		"type": "WarmupDecayLR",
		"params": {
			"total_num_steps": "auto",
			"warmup_min_lr": "auto",
			"warmup_max_lr": "auto",
			"warmup_num_steps": "auto"
		}
	},
	"zero_optimization": {
		"stage": 1,
		"offload_optimizer": {
			"device": "cpu",
			"pin_memory": true
		},
		"offload_param": {
			"device": "cpu",
			"pin_memory": true
		},
		"overlap_comm": true,
		"contiguous_gradients": true,
		"reduce_scatter": true,
		"reduce_bucket_size": 5e8,
		"allgather_bucket_size": 5e8
	},
	"train_micro_batch_size_per_gpu": "auto",
	"gradient_accumulation_steps": "auto",
	"train_batch_size": "auto",
	"gradient_clipping": "auto",
	"steps_per_print": 10,
	"wall_clock_breakdown": false
}


Overwriting src/stanford_alpaca/configs/default_offload_opt_param.json


In [3]:
# Stage the launcher files at the top of ./src (used as the Estimator's source_dir).
!cp s5cmd entry.py requirements.txt train.sh src/
## Replace original train.py and add the extra training scripts
!cp train.py train-stream.py train-stream-wiki.py train-batch-filelist.py src/stanford_alpaca/

# Copy the local DeepSpeed config over the one in the cloned repo's configs directory.
# NOTE(review): this overwrites whatever an earlier cell wrote to that path — confirm which version is intended.
!cp default_offload_opt_param.json src/stanford_alpaca/configs/

In [4]:
import time
from sagemaker.estimator import Estimator

## pre-built docker in https://github.com/aws/deep-learning-containers/blob/master/available_images.md
# NOTE(review): the image URI hard-codes us-east-1; the `region` value from the session is not used here.
image_uri = '763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.13.1-gpu-py39-cu117-ubuntu20.04-sagemaker'

instance_count = 1
instance_type = 'ml.p4d.24xlarge' ## p4d - 8*40G / p4de - 8*80G

# Settings handed to the Estimator via `environment=`; all values are strings.
# Presumably read by entry.py / train.sh inside the container — verify there.
environment = {
    'TOTAL_NUM_SAMPLES': str(3000),
    'NODE_NUMBER': str(instance_count),
    'DATA_S3_PATH': 's3://llm-artifacts-us-east-1/datasets/wiki-zh/*',
    'MODEL_S3_PATH': 's3://llm-artifacts-us-east-1/bloke-llama2-7b-fp16/*', # source model files
    'OUTPUT_MODEL_S3_PATH': 's3://llm-artifacts-us-east-1/output-models/llama2-7b-fp16/', # destination
}

# Training job definition: runs entry.py from ./src on the image and instances chosen above.
estimator = Estimator(role=role,
                      entry_point='entry.py',
                      source_dir='./src',
                      base_job_name='stream-alpaca-train',
                      instance_count=instance_count,
                      instance_type=instance_type,
                      image_uri=image_uri,
                      environment=environment,
                      input_mode='FastFile',
                      max_run=2*24*3600, # maximum job runtime in seconds (2 days here); per the original note, raising it (up to 28 days) requires a quota-increase ticket
                      keep_alive_period_in_seconds=3600, # warm pool: keep the instance & image for the next training job (rolling renewal, max 1 hour); requires a quota to be enabled
                      disable_profiler=True,
                      debugger_hook_config=False)


# # data in channel will be automatically copied to each node - /opt/ml/input/data/train1
# # should change data_path param to above path in torchrun
# input_channel = {'train1': 's3://llm-artifacts-us-east-1/datasets/alpaca-coig-mix-1016/'}
# estimator.fit(input_channel)

# Launch the training job with no input channels; the S3 locations are supplied through `environment` above.
estimator.fit()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Using provided s3_resource


INFO:sagemaker:Creating training-job with name: stream-alpaca-train-2023-10-23-12-00-38-865


2023-10-23 12:00:43 Starting - Starting the training job...
2023-10-23 12:00:52 Downloading - Downloading input data
2023-10-23 12:00:52 Training - Training image download completed. Training in progress...[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-10-23 12:01:16,531 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-10-23 12:01:16,589 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-10-23 12:01:16,598 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-10-23 12:01:16,600 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-10-23 12:01:17,974 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/opt/conda/bin/python3.9 -m pip install -r requirements.txt[0m


In [None]:
9%|▉         | 1/11 [00:00<00:03,  2.86it/s]
18%|█▊        | 2/11 [00:00<00:02,  4.14it/s]
27%|██▋       | 3/11 [00:00<00:01,  4.84it/s]
36%|███▋      | 4/11 [00:00<00:01,  5.27it/s]
45%|████▌     | 5/11 [00:01<00:01,  5.54it/s]
55%|█████▍    | 6/11 [00:01<00:00,  5.71it/s]
64%|██████▎   | 7/11 [00:01<00:00,  5.83it/s]
73%|███████▎  | 8/11 [00:21<00:19,  6.62s/it]
82%|████████▏ | 9/11 [00:22<00:09,  4.71s/it]
91%|█████████ | 10/11 [00:22<00:03,  3.30s/it]
100%|██████████| 11/11 [00:22<00:00,  2.34s/it]
{'train_runtime': 23.7725, 'train_samples_per_second': 118.456, 'train_steps_per_second': 0.463, 'train_loss': 0.15704900568181818, 'epoch': 11.0}
-------DD-------Train1:  1698053860.6141694
-------DD-------Train2:  1698053972.2454438