### SageMaker SDK Environment Prep

In [1]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json
from pathlib import Path

In [2]:
# Shared SageMaker / boto3 handles used by the rest of the notebook.
role = sagemaker.get_execution_role()  # IAM execution role of this notebook
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # session default bucket to house artifacts
model_bucket = bucket  # model artifacts live in the same default bucket
region = sess.boto_region_name  # public attribute; avoids the private `_region_name`
account_id = sess.account_id()
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

In [3]:
# Pull git project to notebook instance
!git clone https://github.com/OptimalScale/LMFlow.git

Cloning into 'LMFlow'...
remote: Enumerating objects: 3331, done.[K
remote: Counting objects: 100% (3310/3310), done.[K
remote: Compressing objects: 100% (1177/1177), done.[K
remote: Total 3331 (delta 1854), reused 3272 (delta 1838), pack-reused 21[K
Receiving objects: 100% (3331/3331), 21.57 MiB | 30.85 MiB/s, done.
Resolving deltas: 100% (1864/1864), done.


### Choose one dataset & run download

In [4]:
%%sh
# Stop at the first failing command so a failed `cd` does not run the
# remaining steps from the wrong directory.
set -e
cd LMFlow/data/
chmod +x download.sh
./download.sh MedMCQA

downloading MedMCQA


--2023-05-10 13:14:58--  http://lmflow.org:5000/MedMCQA.tar.gz
Resolving lmflow.org (lmflow.org)... 3.84.211.127
Connecting to lmflow.org (lmflow.org)|3.84.211.127|:5000... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13476090 (13M) [application/gzip]
Saving to: ‘MedMCQA.tar.gz’

     0K .......... .......... .......... .......... ..........  0% 12.3M 1s
    50K .......... .......... .......... .......... ..........  0% 45.7M 1s
   100K .......... .......... .......... .......... ..........  1% 45.6M 1s
   150K .......... .......... .......... .......... ..........  1% 40.3M 0s
   200K .......... .......... .......... .......... ..........  1%  337M 0s
   250K .......... .......... .......... .......... ..........  2%  199M 0s
   300K .......... .......... .......... .......... ..........  2%  117M 0s
   350K .......... .......... .......... .......... ..........  3%  105M 0s
   400K .......... .......... .......... .......... ..........  3%  289M 0s
   450K ......

MedMCQA/
MedMCQA/train/
MedMCQA/train/train_182822.json
MedMCQA/validation/
MedMCQA/validation/valid_4183.json


### Build the Docker image from the NVIDIA CUDA base image

In [5]:
!mkdir LMFlow/docker-sm

In [6]:
%%writefile LMFlow/docker-sm/Dockerfile
# Training image for LMFlow: CUDA 11.3 / cuDNN 8 base, Python 3.9 venv,
# LMFlow installed in editable mode plus the SageMaker training toolkit.
FROM nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04

# Declared because the build cell passes --build-arg REGION; no instruction below uses it.
ARG REGION

# Pin the container timezone to UTC
ENV TZ=Etc/UTC
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# Keep `apt-get update` and `apt-get install` in one layer so a cached, stale
# package index is never reused for a later install; drop the index afterwards.
RUN apt-get update --fix-missing && \
    apt-get install -y --fix-missing \
        fontconfig \
        libopenmpi-dev \
        git \
        python3.9 \
        python3.9-dev \
        python3.9-venv && \
    rm -rf /var/lib/apt/lists/*

# All later pip installs go into this virtual environment
RUN python3.9 -m venv /venv
ENV PATH=/venv/bin:$PATH
RUN pip3 install mpi4py

RUN git clone https://github.com/OptimalScale/LMFlow.git
WORKDIR /LMFlow/

RUN pip3 install wheel
# add sagemaker training toolkits
RUN pip3 install sagemaker-training
RUN pip3 install -e .

Writing LMFlow/docker-sm/Dockerfile


In [None]:
%%bash

# Build the LMFlow training image and push it to this account's ECR registry.

# The name of our algorithm
algorithm_name=sagemaker-demo-lmflow-v0

# Abort rather than build from the wrong directory if the Dockerfile folder is missing
cd LMFlow/docker-sm || exit 1

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-east-1 if none defined)
region=$(aws configure get region)
region=${region:-us-east-1}

registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.
if ! aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1
then
    aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Log in to our own ECR registry so the image can be pushed.
# `aws ecr get-login` was removed in AWS CLI v2; `get-login-password` works in v1 and v2.
aws ecr get-login-password --region "${region}" \
    | docker login --username AWS --password-stdin "${registry}"

# Log in to the registry that hosts the SageMaker PyTorch images so they can be pulled
aws ecr get-login-password --region "${region}" \
    | docker login --username AWS --password-stdin "763104351884.dkr.ecr.${region}.amazonaws.com"

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build -t "${algorithm_name}" . --build-arg REGION="${region}"
docker tag "${algorithm_name}" "${fullname}"

docker push "${fullname}"

In [8]:
# ## Change the region code below to the region you are using; this example uses us-east-1.
# !aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com

In [10]:
# URI of the image pushed by the build cell, derived from the current account and
# session region instead of a hand-edited placeholder.
# If the build cell fell back to a different region, copy the URI from its log or from ECR.
docker_image_uri = f"{account_id}.dkr.ecr.{region}.amazonaws.com/sagemaker-demo-lmflow-v0:latest"

### Modify the launch shell script according to the AWS resources required

In [11]:
%%writefile LMFlow/scripts/run_sm_train_job.sh
#!/bin/bash

# Entry point of the SageMaker training job: fine-tunes gpt2 with LMFlow's
# examples/finetune.py through deepspeed and keeps stdout/stderr logs.

# Without pipefail the script would exit with tee's status, so a failed
# deepspeed run would be reported as a successful job.
set -o pipefail

deepspeed_args="--master_port=11000"      # Default argument

wandb disabled # disable wandb if necessary

# NOTE(review): outputs are written under /tmp; confirm whether they should go
# to the SageMaker model directory instead if they must outlive the job.
LOCAL_SM_DIR=/tmp/lmflow
exp_id=finetune_v0.1
SAVE_PATH="${LOCAL_SM_DIR}/${exp_id}/model_output"
LOG_FILE="${SAVE_PATH}/logs"

BASE_CODE_PATH=/opt/ml/code/LMFlow

## If you use S3 as data source
# train_dataset_path='/opt/ml/input/data/trains'
# test_dataset_path='/opt/ml/input/data/tests'
# dataset_path=${train_dataset_path}
## else, use sample data in code path
dataset_path=${BASE_CODE_PATH}/data/MedMCQA/train

output_dir=${SAVE_PATH}
log_dir=${LOG_FILE}
mkdir -p ${output_dir} ${log_dir}

# stderr is redirected on the deepspeed command itself (a trailing `2>` after
# `| tee` would only capture tee's own stderr). The inner tee writes train.err
# and echoes back to stderr so errors still show up in the job log.
deepspeed ${deepspeed_args} \
  ${BASE_CODE_PATH}/examples/finetune.py \
    --model_name_or_path gpt2 \
    --dataset_path ${dataset_path} \
    --output_dir ${output_dir} --overwrite_output_dir \
    --num_train_epochs 0.01 \
    --learning_rate 2e-5 \
    --block_size 512 \
    --per_device_train_batch_size 1 \
    --deepspeed ${BASE_CODE_PATH}/configs/ds_config_zero3.json \
    --bf16 \
    --run_name finetune \
    --validation_split_percentage 0 \
    --logging_steps 20 \
    --do_train \
    --ddp_timeout 72000 \
    --save_steps 5000 \
    --dataloader_num_workers 1 \
    2> >(tee ${log_dir}/train.err >&2) \
    | tee ${log_dir}/train.log

Writing LMFlow/scripts/run_sm_train_job.sh


In [12]:
# remove dependency installation that already installed in dockerfile
! rm -rf LMFlow/requirements.txt
! rm -rf LMFlow/setup.py

### SageMaker core API call

In [13]:
instance_type = 'ml.p4d.24xlarge'

# Estimator that runs the custom LMFlow image and starts training through the
# shell script written above; the current directory is uploaded as the source tree.
md_est = sagemaker.estimator.Estimator(
    image_uri=docker_image_uri,
    role=role,
    sagemaker_session=sess,
    source_dir='./',
    entry_point="LMFlow/scripts/run_sm_train_job.sh",  # 'scripts/run_fintune.sh'
    instance_type=instance_type,
    instance_count=1,
    max_run=24 * 3600,  # 24 hours, in seconds; increase for a large number of steps
    keep_alive_period_in_seconds=1800,  # temporarily hold the training resource after the job
    # volume_size=500,  # not necessary for NVMe-backed instances
)

In [14]:
# S3 input channels, used only by the commented-out `fit(data_chnl)` variant.
# The channel names match the /opt/ml/input/data/trains and .../tests paths
# that are commented out in the training script.
data_chnl = {
    'trains': 's3://YOUR_BUCKET_NAME/lmflow-data/MedMCQA/train',
    'tests': 's3://YOUR_BUCKET_NAME/lmflow-data/MedMCQA/validation',
}

# Launch without input channels: the script trains on the sample data shipped in the code path.
md_est.fit()
# md_est.fit(data_chnl)

Using provided s3_resource


INFO:sagemaker:Creating training-job with name: sagemaker-demo-lmflow-v0-2023-05-10-13-16-47-354


2023-05-10 13:16:53 Starting - Starting the training job...
2023-05-10 13:17:12 Downloading - Downloading input data
[34m== CUDA ==[0m
[34mCUDA Version 11.3.0[0m
[34mContainer image Copyright (c) 2016-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.[0m
[34mThis container image and its contents are governed by the NVIDIA Deep Learning Container License.[0m
[34mBy pulling and using the container, you accept the terms and conditions of this license:[0m
[34mhttps://developer.nvidia.com/ngc/nvidia-deep-learning-container-license[0m
[34mA copy of this license is made available in this container at /NGC-DL-CONTAINER-LICENSE for your convenience.[0m
[34m*************************[0m
[34m** DEPRECATION NOTICE! **[0m
[34m*************************[0m
[34mTHIS IMAGE IS DEPRECATED and is scheduled for DELETION.
    https://gitlab.com/nvidia/container-images/cuda/blob/master/doc/support-policy.md[0m
[34m2023-05-10 13:17:14,376 sagemaker-training-toolkit INFO     No N

KeyboardInterrupt: 