In [None]:
# install once
!pip install -U boto3 sagemaker awscli
# restart jupyter kernel

In [1]:
import sagemaker
import boto3, os
from sagemaker import get_execution_role

# Shared SageMaker context used by every later cell.
sess = sagemaker.Session()

# Region of the underlying boto3 session; used below to build the ECR image URI.
region = sess.boto_session.region_name

# Default bucket of this session; all S3 paths in this notebook live under it.
sagemaker_default_bucket = sess.default_bucket()

# IAM role of the current execution environment, passed on to the training job.
role = get_execution_role()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
from sagemaker.pytorch import PyTorch
from sagemaker.estimator import Estimator

# Training container image, pulled from the regional ECR registry.
# Available tags:
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
# Earlier tag, kept for reference:
#   pytorch-training:2.2.0-gpu-py310-cu121-ubuntu20.04-sagemaker
image_uri = (
    f"763104351884.dkr.ecr.{region}.amazonaws.com/"
    "pytorch-training:2.3.0-gpu-py311-cu121-ubuntu20.04-sagemaker"
)

# Instance selection. Other options that can be swapped in:
#   "ml.g5.12xlarge"    4 * A10g (24G/GPU)
#   "ml.g5.48xlarge"    8 * A10g (24G/GPU)
#   "ml.p4d.24xlarge"   8 * A100 (40G/GPU)
#   "ml.p5.48xlarge"    8 * H100 (80G/GPU)
instance_type = "ml.g5.2xlarge"  # 1 * A10g (24G/GPU)

# 1 for a single node, more than 1 for multi-node training.
instance_count = 1

# Environment variables handed to the training job (see `environment=envs`).
# All three point into the session's default bucket: the training dataset,
# the base model, and the prefix where the trained model is saved.
# NOTE(review): presumably consumed by estimator_entry.py - confirm there,
# including the meaning of the trailing "/*" on the two input paths.
envs = dict(
    DATA_S3_PATH=f"s3://{sagemaker_default_bucket}/qwen2-train-dataset/*",
    MODEL_ID_OR_S3_PATH=f"s3://{sagemaker_default_bucket}/Qwen2-0.5B-Instruct/*",
    MODEL_SAVE_PATH_S3=f"s3://{sagemaker_default_bucket}/output-model/2408/",
)

# No hyperparameters are passed; the job is configured through `envs` only.
hypers = {}

# Describe the training job: a generic Estimator that runs estimator_entry.py
# from submit_src/ inside the container selected by image_uri.
smp_estimator = Estimator(
    # --- what to run ---
    image_uri=image_uri,
    source_dir="submit_src/",
    entry_point="estimator_entry.py",
    hyperparameters=hypers,
    environment=envs,
    # --- where to run ---
    instance_type=instance_type,
    instance_count=instance_count,
    # --- identity and naming ---
    role=role,
    sagemaker_session=sess,
    base_job_name="sm-qwen2-multinode",
    # --- job lifecycle ---
    max_run=7200,  # run-time limit in seconds (2 hours)
    keep_alive_period_in_seconds=60,
    enable_remote_debug=True,
    disable_output_compression=True,
)

# Submit the job. No input channels are passed here; the S3 locations of the
# dataset and base model travel in `envs` instead. The call streams the job
# status and container logs into the cell output while the job runs.
smp_estimator.fit()

INFO:sagemaker:Creating training-job with name: sm-qwen2-multinode-2024-11-06-01-05-48-380


2024-11-06 01:05:52 Starting - Starting the training job
2024-11-06 01:05:52 Pending - Training job waiting for capacity......
2024-11-06 01:06:42 Pending - Preparing the instances for training...
2024-11-06 01:07:24 Downloading - Downloading the training image.....................
2024-11-06 01:10:47 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
  "cipher": algorithms.TripleDES,[0m
  "class": algorithms.TripleDES,[0m
[34m2024-11-06 01:11:14,799 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-11-06 01:11:14,816 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-11-06 01:11:14,826 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-11-06 01:11:14,828 sagemaker_pytorch_contain

In [3]:
!echo s3://$sagemaker_default_bucket/output-model/2408/

s3://sagemaker-us-east-1-633205212955/output-model/2408/


In [4]:
!aws s3 ls s3://$sagemaker_default_bucket/output-model/2408/

                           PRE checkpoint-10/
2024-11-06 01:13:49       1236 README.md
2024-11-06 01:13:49         80 added_tokens.json
2024-11-06 01:13:49        196 all_results.json
2024-11-06 01:13:49        729 config.json
2024-11-06 01:13:49        242 generation_config.json
2024-11-06 01:13:49    1671853 merges.txt
2024-11-06 01:13:49  988097824 model.safetensors
2024-11-06 01:13:49        367 special_tokens_map.json
2024-11-06 01:13:49   11418266 tokenizer.json
2024-11-06 01:13:49       1533 tokenizer_config.json
2024-11-06 01:13:49        196 train_results.json
2024-11-06 01:13:49        174 trainer_log.jsonl
2024-11-06 01:13:49        969 trainer_state.json
2024-11-06 01:13:49       7608 training_args.bin
2024-11-06 01:13:49    2776833 vocab.json


In [5]:
!aws s3 rm --recursive s3://$sagemaker_default_bucket/output-model/2408/checkpoint-10/

delete: s3://sagemaker-us-east-1-633205212955/output-model/2408/checkpoint-10/added_tokens.json
delete: s3://sagemaker-us-east-1-633205212955/output-model/2408/checkpoint-10/special_tokens_map.json
delete: s3://sagemaker-us-east-1-633205212955/output-model/2408/checkpoint-10/config.json
delete: s3://sagemaker-us-east-1-633205212955/output-model/2408/checkpoint-10/global_step10/zero_pp_rank_0_mp_rank_00_model_states.pt
delete: s3://sagemaker-us-east-1-633205212955/output-model/2408/checkpoint-10/latest
delete: s3://sagemaker-us-east-1-633205212955/output-model/2408/checkpoint-10/generation_config.json
delete: s3://sagemaker-us-east-1-633205212955/output-model/2408/checkpoint-10/tokenizer.json
delete: s3://sagemaker-us-east-1-633205212955/output-model/2408/checkpoint-10/rng_state.pth
delete: s3://sagemaker-us-east-1-633205212955/output-model/2408/checkpoint-10/scheduler.pt
delete: s3://sagemaker-us-east-1-633205212955/output-model/2408/checkpoint-10/tokenizer_config.json
delete: s3://sag