In [7]:
import sagemaker
import boto3
from sagemaker import get_execution_role
from omegaconf import DictConfig, OmegaConf

# Session-level handles used by the later cells: the SageMaker session, the
# execution role, the session's default bucket, and the caller's AWS
# account id and region.
sess = sagemaker.Session()
# Hand the existing session to get_execution_role() so the SDK does not
# construct a second Session internally just to look up the role.
role = get_execution_role(sagemaker_session=sess)
sagemaker_default_bucket = sess.default_bucket()

# Account id comes from STS; region comes from the underlying boto3 session.
account = sess.boto_session.client("sts").get_caller_identity()["Account"]
region = sess.boto_session.region_name


INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [8]:
# Extra mpirun arguments, later passed as `custom_mpi_options` in the
# estimator's distribution config. Each entry becomes one `-x NAME=VALUE`
# flag; the flags are joined with single spaces.
mpioptions = " ".join(
    f"-x {name}={value}"
    for name, value in (
        # logging verbosity
        ("NCCL_DEBUG", "WARN"),
        ("SMDEBUG_LOG_LEVEL", "ERROR"),
        # SMP_* settings
        ("SMP_DISABLE_D2D", "1"),
        ("SMP_D2D_GPU_BUFFER_SIZE_BYTES", "1"),
        ("SMP_NCCL_THROTTLE_LIMIT", "1"),
        # EFA / RDMA settings
        ("FI_EFA_USE_DEVICE_RDMA", "1"),
        ("FI_PROVIDER", "efa"),
        ("RDMAV_FORK_SAFE", "1"),
    )
)

In [13]:
from sagemaker.estimator import Estimator
from sagemaker.pytorch import PyTorch

# Training container image.
# NOTE(review): the registry host is pinned to us-east-1 even though `region`
# is looked up from the session in an earlier cell -- confirm before running
# from another region.
image_uri = '763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.0.0-gpu-py310-cu118-ubuntu20.04-sagemaker'
# Alternative images (kept for reference):
# image_uri = '763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.13.1-gpu-py39-cu117-ubuntu20.04-sagemaker'
# image_uri = '763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.13.1-transformers4.26.0-gpu-py39-cu117-ubuntu20.04'

# Job configuration; only `smp_init_params` is read in this cell.
cfg = OmegaConf.load('smp_llama_7b_zh_instruct_coig.yaml')

# PyTorch estimator: MPI launcher with 8 processes per host plus the
# smdistributed model-parallel library, configured from the YAML above.
est = PyTorch(role=role,
            entry_point='smp_trainer_base_ds_mul_aws.py',
            source_dir='./',
            base_job_name='panda-llm-smp-job',
            instance_count=1,
            instance_type='ml.p4d.24xlarge',
            image_uri=image_uri,
            # framework_version='1.13.1',
            # py_version='py39',
            distribution={
                "mpi": {"enabled": True,
                        "processes_per_host": 8,
                        "custom_mpi_options": mpioptions,
                       },
                "smdistributed": {
                        "modelparallel": {
                            "enabled": True,
                            # to_container() converts the config recursively into
                            # plain dict/list values and resolves interpolations;
                            # dict(cfg.smp_init_params) is only a shallow copy that
                            # leaves nested values as OmegaConf nodes.
                            "parameters": OmegaConf.to_container(
                                cfg.smp_init_params, resolve=True
                            ),
                        }
                    }
            },
            max_run=3600*24*2,  # upper limit on how long the training job may run (2 days, in seconds)
            keep_alive_period_in_seconds=3600,
            disable_profiler=True,
            debugger_hook_config=False
            )

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [None]:
# Run notes: changed to fp16=false.
# TODO: 1) model creation without the wrapper; 2) no barrier in cache data.

# Input channels for the job: channel name -> S3 prefix holding the dataset.
dat_chnl = {
    'train123': 's3://llm-artifacts-us-east-1/datasets/coig/',
}

# Launch the training job with the channel mapping as its inputs.
est.fit(inputs=dat_chnl)

INFO:sagemaker:Creating training-job with name: panda-llm-job-2023-05-26-07-50-10-248


Using provided s3_resource
2023-05-26 07:50:13 Starting - Starting the training job..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-05-26 07:50:23,135 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-05-26 07:50:23,194 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-05-26 07:50:23,202 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-05-26 07:50:23,203 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-05-26 07:50:24,662 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/opt/conda/bin/python3.10 -m pip install -r requirements.txt[0m
[34mCollecting wandb (from -r requirements.txt (line 1))[0m
[34mDownloading wandb-0.15.3-py3-none-any.whl (2.0 MB)[0m