In [None]:
# Restart kernel after pip install
!pip install --upgrade sagemaker

In [None]:
# download any train data
!wget https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json

In [None]:
# download s5cmd for faster copying than 'aws s3 cp'
!curl -L https://github.com/peak/s5cmd/releases/download/v2.0.0/s5cmd_2.0.0_Linux-64bit.tar.gz | tar -xz s5cmd

In [None]:
!rm -rf src
!mkdir -p src

In [None]:
!mv s5cmd src/
!mv alpaca_data.json src/
!mv train.py src/
!mv requirements.txt src/

In [None]:
import sagemaker
import boto3
from sagemaker import get_execution_role

# SageMaker session plus the values derived from it.
sess = sagemaker.Session()
region = sess.boto_session.region_name
sagemaker_default_bucket = sess.default_bucket()

# IAM role the training job will run under.
role = get_execution_role()

In [None]:
import time
from sagemaker.estimator import Estimator
from datetime import datetime

# Pre-built dockers: https://github.com/aws/deep-learning-containers/blob/master/available_images.md
# The registry host is built from the session's region instead of a hard-coded
# us-east-1, so the image is pulled from the region the notebook runs in.
# NOTE(review): account 763104351884 hosts these images in most commercial
# regions; a few regions use a different account ID -- check the list above.
image_uri = f'763104351884.dkr.ecr.{region}.amazonaws.com/pytorch-training:2.0.0-gpu-py310-cu118-ubuntu20.04-sagemaker'

instance_count = 1
instance_type = 'ml.g5.4xlarge'

# A per-run timestamp gives every run its own S3 output prefix.
ts_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
model_output_path = f's3://{sagemaker_default_bucket}/output-models/bloke-llama2-7b-qlora/{ts_str}/'

# Environment variables handed to the training container.
environment = {
    # 'NODE_NUMBER':str(instance_count),
    'MODEL_S3_PATH': f's3://{sagemaker_default_bucket}/bloke-llama2-7b-fp16/*', # source model files
    'OUTPUT_MODEL_S3_PATH': model_output_path # destination
}

# Hyperparameters passed to the entry point. data_dir points at the dataset
# staged in ./src (presumably unpacked to /opt/ml/code in the container).
hyp_param = {
    'seed':99,
    'data_dir':'/opt/ml/code/alpaca_data.json',
    'per_device_train_batch_size':1,
    'max_steps':20
}

estimator = Estimator(role=role,
                      entry_point='train.py',
                      source_dir='./src',
                      base_job_name='llama2-qlora-train',
                      instance_count=instance_count,
                      instance_type=instance_type,
                      image_uri=image_uri,
                      environment=environment,
                      hyperparameters=hyp_param,
                      max_run=2*24*3600, # max job runtime; the default limit is 2 days, raising it (up to 28 days) needs a quota-increase ticket
                      keep_alive_period_in_seconds=3600, # warm pool: keep the instance & image for the next job (rolling renewal, max 1 hour); requires warm-pool quota
                      disable_profiler=True,
                      debugger_hook_config=False)


# data in channel will be automatically copied to training node, e.g. /opt/ml/input/data/trainabc
# input_channel = {'trainabc': 's3://<s3_bucket>/datasets/cn_alpaca_jsonline_data/'}
# estimator.fit(input_channel)

# No input channel: the dataset ships inside source_dir (see data_dir above).
estimator.fit()

In [None]:
# Show the S3 output prefix; copy it into the LMI option.s3url setting.
print('PATH for LMI inference option.s3url:', model_output_path, sep='\n')

In [None]:
# List the objects under this run's S3 output prefix.
# NOTE(review): presumably train.py uploads the trained model there via
# OUTPUT_MODEL_S3_PATH -- confirm against train.py.
!aws s3 ls {model_output_path}