### SageMaker SDK Environment Prep

In [None]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json
from pathlib import Path

In [None]:
# IAM role that SageMaker assumes for the jobs launched from this notebook.
role = sagemaker.get_execution_role()
# SageMaker session used for the AWS API interactions below.
sess = sagemaker.session.Session()
# Default SageMaker bucket of the session; used to house artifacts.
bucket = sess.default_bucket()
# Model artifacts live in the same default bucket (kept as a separate name so
# it can be pointed elsewhere without touching `bucket`).
model_bucket = bucket
# Use the public `boto_region_name` attribute rather than the private
# `_region_name`, which is an implementation detail of the SDK.
region = sess.boto_region_name
account_id = sess.account_id()
# Pin the low-level clients to the session's region so every API call made
# from this notebook targets the same region.
s3_client = boto3.client("s3", region_name=region)
sm_client = boto3.client("sagemaker", region_name=region)
smr_client = boto3.client("sagemaker-runtime", region_name=region)

In [None]:
# Pull git project to notebook instance
!git clone https://github.com/OptimalScale/LMFlow.git

### Choose a dataset and run the download script

In [None]:
%%sh
# Download the MedMCQA dataset with LMFlow's own download script.
# Stop at the first failing command: without this, a failed `cd` would still
# run chmod and ./download.sh from the wrong directory.
set -e
cd LMFlow/data/
chmod +x download.sh
./download.sh MedMCQA

### Build the NVIDIA CUDA-based Docker image

In [None]:
!mkdir LMFlow/docker-sm

In [None]:
%%writefile LMFlow/docker-sm/Dockerfile
# Training image: NVIDIA CUDA 11.3 / cuDNN 8 on Ubuntu 20.04 with Python 3.9,
# Open MPI, LMFlow and the SageMaker training toolkit.
FROM nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04

# build_push.sh passes --build-arg REGION; declare it so the argument is
# consumed. It is not used by any instruction below.
ARG REGION

# Set the timezone explicitly (presumably so that installing tzdata as a
# dependency does not stop at an interactive prompt).
ENV TZ=Etc/UTC
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# Refresh the package index and install in a single layer so a cached
# `apt-get update` layer can never be paired with newer install commands,
# then drop the package lists to keep the layer small.
RUN apt-get update --fix-missing \
    && apt-get install -y --fix-missing \
        fontconfig \
        libopenmpi-dev \
        git \
        python3.9 python3.9-dev python3.9-venv \
    && rm -rf /var/lib/apt/lists/*

# Isolated Python 3.9 environment; putting it first on PATH makes `pip3` and
# `python` resolve to the virtualenv for every later instruction.
RUN python3.9 -m venv /venv
ENV PATH=/venv/bin:$PATH
RUN pip3 install mpi4py

# NOTE(review): this clones the default branch at build time, so image
# contents depend on when the image is built; pin a tag/commit if needed.
RUN git clone https://github.com/OptimalScale/LMFlow.git
WORKDIR /LMFlow/

RUN pip3 install wheel
# SageMaker training toolkit
RUN pip3 install sagemaker-training
# Editable install of LMFlow from the cloned checkout
RUN pip3 install -e .

In [None]:
%%writefile build_push.sh
#!/bin/bash
# Build the LMFlow training image from LMFlow/docker-sm and push it to the
# caller's ECR registry as <account>.dkr.ecr.<region>.amazonaws.com/<name>:latest.

# Stop on the first error, on unset variables and on failures inside pipelines,
# so a failed build or login can never be followed by a tag/push.
set -euo pipefail

# The name of our algorithm (also the ECR repository name)
algorithm_name=sagemaker-demo-lmflow-v0

cd LMFlow/docker-sm

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-east-1 if
# none defined). `aws configure get` exits non-zero when the value is unset,
# hence the `|| true` under `set -e`.
region=$(aws configure get region || true)
region=${region:-us-east-1}

registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it. The region is passed
# explicitly so the repository is created where ${fullname} points.
if ! aws ecr describe-repositories --region "${region}" --repository-names "${algorithm_name}" > /dev/null 2>&1
then
    aws ecr create-repository --region "${region}" --repository-name "${algorithm_name}" > /dev/null
fi

# Log Docker in to our own registry. `aws ecr get-login` no longer exists in
# AWS CLI v2; `get-login-password` piped into `docker login` replaces it and
# keeps the token off the command line.
aws ecr get-login-password --region "${region}" \
    | docker login --username AWS --password-stdin "${registry}"

# Log in to the registry that hosts the SageMaker PyTorch images so they can
# be pulled. The Dockerfile written by this notebook does not pull from it, so
# a failure here is reported but does not abort the build.
aws ecr get-login-password --region "${region}" \
    | docker login --username AWS --password-stdin "763104351884.dkr.ecr.${region}.amazonaws.com" \
    || echo "warning: login to registry 763104351884 failed; continuing" >&2

# Build the docker image locally with the image name and then push it to ECR
# with the full name.
docker build -t "${algorithm_name}" . --build-arg REGION="${region}"
docker tag "${algorithm_name}" "${fullname}"

docker push "${fullname}"

In [None]:
%%sh
# Mark the generated build script as executable, then run it to build the
# image and push it to ECR.
chmod +x build_push.sh; ./build_push.sh

In [None]:
# ECR URI of the image pushed by build_push.sh, built from the account id and
# region resolved earlier in the notebook instead of a hand-edited placeholder.
# NOTE: build_push.sh takes its region from `aws configure get region`
# (falling back to us-east-1). If that differs from the session region, copy
# the URI from the build log or the ECR console instead.
docker_image_uri = f"{account_id}.dkr.ecr.{region}.amazonaws.com/sagemaker-demo-lmflow-v0:latest"

### Modify the launch shell script to match the AWS resources required

In [None]:
%%writefile LMFlow/scripts/run_sm_train_job.sh
#!/bin/bash
# SageMaker training entry point: downloads the base model from S3 with s5cmd,
# fine-tunes it with LMFlow's examples/finetune.py under DeepSpeed, and uploads
# the output directory back to S3. Exits non-zero when training fails so the
# job is not reported as successful.

deepspeed_args="--master_port=11000"      # Default argument

wandb disabled # disable wandb if necessary

LOCAL_SM_DIR=/tmp/lmflow
exp_id=finetune_v0.1
MODEL_SAVE_PATH="${LOCAL_SM_DIR}/model_output/${exp_id}"
SAVE_PATH="${LOCAL_SM_DIR}/${exp_id}"
LOG_FILE="${SAVE_PATH}/logs"

BASE_CODE_PATH=/opt/ml/code/LMFlow

# S3 locations of the base model and of the fine-tuned output.
model_s3_uri="s3://llm-artifacts-us-east-1/decapoda-research-llama-7b-hf"
output_s3_uri="s3://llm-artifacts-us-east-1/output-models/lmflow-7b"

## If you use S3 as data source
# train_dataset_path='/opt/ml/input/data/trains'
# test_dataset_path='/opt/ml/input/data/tests'
# dataset_path=${train_dataset_path}
## else, use sample data in code path
dataset_path="${BASE_CODE_PATH}/data/MedMCQA/train"

output_dir="${MODEL_SAVE_PATH}"
log_dir="${LOG_FILE}"
mkdir -p "${output_dir}" "${log_dir}"

# NOTE(review): assumes an s5cmd binary sits in the working directory the
# script is started from -- confirm it is shipped alongside the code.
chmod +x ./s5cmd
# The wildcard is quoted so it reaches s5cmd untouched instead of being
# subject to shell globbing. Abort early if the base model cannot be fetched.
if ! ./s5cmd sync "${model_s3_uri}/*" "${LOCAL_SM_DIR}/model_artifacts/"
then
    echo "error: failed to download model artifacts from ${model_s3_uri}" >&2
    exit 1
fi

# ${deepspeed_args} is intentionally unquoted so that several space-separated
# launcher arguments are split into separate words.
# stderr is redirected on the deepspeed command itself (the original attached
# `2>` to tee, leaving train.err empty); the inner tee keeps it on the
# console's stderr as well as in the file.
deepspeed ${deepspeed_args} \
  "${BASE_CODE_PATH}/examples/finetune.py" \
    --model_name_or_path "${LOCAL_SM_DIR}/model_artifacts/" \
    --dataset_path "${dataset_path}" \
    --output_dir "${output_dir}" --overwrite_output_dir \
    --num_train_epochs 0.01 \
    --learning_rate 2e-5 \
    --block_size 512 \
    --per_device_train_batch_size 1 \
    --deepspeed "${BASE_CODE_PATH}/configs/ds_config_zero3.json" \
    --bf16 \
    --run_name finetune \
    --validation_split_percentage 0 \
    --logging_steps 20 \
    --do_train \
    --ddp_timeout 72000 \
    --save_steps 5000 \
    --dataloader_num_workers 1 \
    2> >(tee "${log_dir}/train.err" >&2) \
    | tee "${log_dir}/train.log"
# Exit status of deepspeed itself, not of the trailing tee.
train_status=${PIPESTATUS[0]}

# Upload whatever was written to the output directory, as before.
./s5cmd sync "${output_dir}" "${output_s3_uri}/$(date +%Y-%m-%d-%H-%M-%S)/"
upload_status=$?

# Surface a training failure to the caller; otherwise report the upload result.
if [ "${train_status}" -ne 0 ]
then
    echo "error: training exited with status ${train_status}" >&2
    exit "${train_status}"
fi
exit "${upload_status}"


In [None]:
# remove dependency installation that already installed in dockerfile
! rm -rf LMFlow/requirements.txt
! rm -rf LMFlow/setup.py

### SageMaker core API call

In [None]:
# Instance type the training job runs on.
instance_type = 'ml.p4d.24xlarge'

# Estimator that runs the custom image with the shell launcher as entry point.
md_est = sagemaker.estimator.Estimator(
    # Container image and permissions
    image_uri=docker_image_uri,
    role=role,
    sagemaker_session=sess,
    # Code: the notebook's working directory is used as source_dir and the
    # entry point is given relative to it (alternative: 'scripts/run_finetune.sh')
    source_dir='./',
    entry_point="LMFlow/scripts/run_sm_train_job.sh",
    # Compute resources
    instance_type=instance_type,
    instance_count=1,
    # volume_size=500,  # not necessary for NVMe resource
    max_run=24 * 3600,  # to be increased for a large number of steps
    keep_alive_period_in_seconds=1800,  # temporarily hold the training resource
)

In [None]:
# Optional S3 input channels. The channel names match the commented-out
# /opt/ml/input/data/trains and /opt/ml/input/data/tests paths in
# run_sm_train_job.sh.
data_chnl = dict(
    trains='s3://YOUR_BUCKET_NAME/lmflow-data/MedMCQA/train',
    tests='s3://YOUR_BUCKET_NAME/lmflow-data/MedMCQA/validation',
)

# Start the job without input channels: the launcher script reads the sample
# data from the code path. To train from S3, use the call below instead.
md_est.fit()
# md_est.fit(data_chnl)