In [None]:
# Install once:
# !pip install -U boto3 sagemaker awscli
# Then restart the Jupyter kernel so the upgraded packages are loaded.

In [1]:
import sagemaker
from sagemaker import get_execution_role

# SageMaker session shared by every estimator created in this notebook.
sess = sagemaker.Session()
# role = get_execution_role()
# The execution role ARN is hard-coded instead of using the get_execution_role()
# lookup above.
# NOTE(review): presumably because the notebook runs outside a SageMaker-managed
# environment — confirm, and consider reading the ARN from configuration.
role = 'arn:aws:iam::633205212955:role/service-role/AmazonSageMaker-ExecutionRole-20220923T160810'
# Default S3 bucket of the session; later cells build the model output path from it.
sagemaker_default_bucket = sess.default_bucket()
# Region of the underlying boto session; later cells use it to build the ECR image URI.
region = sess.boto_session.region_name



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ubuntu/.config/sagemaker/config.yaml


In [None]:
from sagemaker.pytorch import PyTorch
from sagemaker.estimator import Estimator

# Single-job submission: configure one SageMaker training job for a hand-picked
# LLaMA-Factory YAML config. The job is only created when `fit()` is uncommented.

# Training image (AWS Deep Learning Containers, PyTorch training, SageMaker variant).
# Available tags:
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
# image_uri = f'763104351884.dkr.ecr.{region}.amazonaws.com/pytorch-training:2.2.0-gpu-py310-cu121-ubuntu20.04-sagemaker'
image_uri = f'763104351884.dkr.ecr.{region}.amazonaws.com/pytorch-training:2.3.0-gpu-py311-cu121-ubuntu20.04-sagemaker'
# image_uri = f'763104351884.dkr.ecr.{region}.amazonaws.com/pytorch-training:2.4.0-gpu-py311-cu124-ubuntu22.04-sagemaker'
# image_uri = f'763104351884.dkr.ecr.{region}.amazonaws.com/pytorch-training:2.5.1-gpu-py311-cu124-ubuntu22.04-sagemaker'

# Instance type: keep exactly one line uncommented.
# instance_type = "ml.g5.2xlarge"     # 1 * A10G (24G/GPU)
# instance_type = "ml.g5.12xlarge"    # 4 * A10G (24G/GPU)
# instance_type = "ml.g5.48xlarge"    # 8 * A10G (24G/GPU)
# instance_type = "ml.p4d.24xlarge"   # 8 * A100 (40G/GPU)
instance_type = "ml.p5.48xlarge"      # 8 * H100 (80G/GPU)
# instance_type = "ml.g6e.48xlarge"   # 8 * L40S (48G/GPU)
# instance_type = "ml.p3dn.24xlarge"  # 8 * V100 (32G/GPU)

instance_count = 2                  # 1 or Multi-node

# Name (without extension) of the LLaMA-Factory YAML config to run.
llamafactory_yaml = 'llama3_full_dpo_z2_1_4'

# Environment variables passed to the training container.
# NOTE(review): presumably read by submit_src/estimator_entry.py — confirm there.
envs = {
    # "DATA_S3_PATH": f's3://{sagemaker_default_bucket}/qwen2-train-dataset/*',
    # 'MODEL_ID_OR_S3_PATH': f's3://llm-artifacts-us-east-1/MTLM-llama-3-8b-instruct/*',
    'MODEL_ID_OR_S3_PATH': f's3://llm-artifacts-us-east-1/Llama-3.2-3B-Instruct/*',
    'MODEL_SAVE_PATH_S3': f's3://{sagemaker_default_bucket}/output-model/241201/',
    'CONF_YAML_NAME': f'{llamafactory_yaml}.yaml'
}

# No SageMaker hyperparameters are used; configuration goes through `envs`.
hypers = {
}

smp_estimator = Estimator(
    role=role,
    sagemaker_session=sess,
    # A '-' now separates the instance count from the config name; previously
    # the two were fused (e.g. 'inst-2llama3-...'), which made job names hard to read.
    base_job_name=f'inst-{instance_count}-' + llamafactory_yaml.replace('_', '-'),
    entry_point="estimator_entry.py",
    source_dir='submit_src/',
    instance_type=instance_type,
    instance_count=instance_count,
    environment=envs,
    hyperparameters=hypers,
    image_uri=image_uri,
    max_run=7200,                       # job timeout, in seconds
    keep_alive_period_in_seconds=1800,  # keep instances warm between jobs
    enable_remote_debug=True,
    disable_output_compression=True,
)

# smp_estimator.fit()

In [None]:
# Sweep setup: training image, CloudWatch Logs client, and one generated
# LLaMA-Factory YAML per combination of the sweep variables below.
from sagemaker.estimator import Estimator
image_uri = f'763104351884.dkr.ecr.{region}.amazonaws.com/pytorch-training:2.3.0-gpu-py311-cu121-ubuntu20.04-sagemaker'

import boto3
# Log group and client used by the sweep loop to download each job's logs.
log_group_name = "/aws/sagemaker/TrainingJobs"
logs_client = boto3.client('logs')

# Only ConfigGenerator is used from this module in this cell.
from config_gens import *

# Variable definitions: every combination of these values yields one config file.
variables = dict(
    zero_conf=["ds_z1", "ds_z2", "ds_z3", "ds_z2_offload", "ds_z3_offload"],
    micro_bs=[1, 2, 4, 8],
    accum_steps=[2, 4, 8],
)

# Initialise the generator from the template YAML.
generator = ConfigGenerator('submit_src/llama3_full_dpo_template.yaml')

# Generate the config files; the filename template is filled from `variables`.
configs = generator.generate_configs(
    variables=variables,
    output_dir='submit_src/configs/',
    filename_template="genConf_{zero_conf}_mbs{micro_bs}_acm{accum_steps}.yaml",
)

# Index the generated configs by position so the sweep loop can report progress.
configs_dict = dict(enumerate(configs))

# Target global batch size; the sweep loop only runs configs that match it.
GLB_BS = 64

# Sweep loop: submit one SageMaker training job per generated config whose
# global batch size equals GLB_BS, then save the job's CloudWatch logs locally.
import os

GPUS_PER_INSTANCE = 8             # ml.p5.48xlarge has 8 GPUs per node
instance_type = "ml.p5.48xlarge"
LOG_DIR = 'smest_logs'

# Any config whose file name contains one of these substrings is skipped.
skip_list = [
    ## z1 and big bs
    'z1_mbs8',
    'z2_mbs1',
    ## test done
    'z3_mbs1',
    ## will be slower
    'z2_offload_mbs1',
    'z3_offload_mbs1',
    ## oom

    ## done
    'z1_mbs1_acm8',
]


def _iter_log_events(client, group_name, stream_name):
    """Yield every event of one CloudWatch log stream, oldest first.

    A single `get_log_events` call returns only one page, and without
    `startFromHead=True` that page holds the most recent events. This pages
    forward until the service returns the same token that was passed in,
    which is how the API signals the end of the stream.
    """
    request = {
        'logGroupName': group_name,
        'logStreamName': stream_name,
        'startFromHead': True,
    }
    token = None
    while True:
        if token is not None:
            request['nextToken'] = token
        page = client.get_log_events(**request)
        yield from page['events']
        next_token = page.get('nextForwardToken')
        # An empty page does not mean "done"; only a repeated token does.
        if next_token is None or next_token == token:
            break
        token = next_token


def _save_job_logs(client, group_name, job_name, out_path):
    """Print and write to `out_path` all log events of a training job's streams."""
    stream_pages = client.get_paginator('describe_log_streams').paginate(
        logGroupName=group_name,
        logStreamNamePrefix=job_name,
    )
    with open(out_path, 'w', encoding='utf-8') as f:
        for stream_page in stream_pages:
            for stream in stream_page['logStreams']:
                for event in _iter_log_events(client, group_name, stream['logStreamName']):
                    print(event['message'])
                    f.write(event['message'] + '\n')


# The log directory must exist before the first job's logs are written.
os.makedirs(LOG_DIR, exist_ok=True)

for inst in [1]:
    for config_i, config in configs_dict.items():

        gen_bs = config['params']['micro_bs']
        gen_accum = config['params']['accum_steps']

        # Global batch = nodes * GPUs per node * micro batch * accumulation steps.
        if inst * GPUS_PER_INSTANCE * gen_bs * gen_accum != GLB_BS:
            continue

        conf_file_name = config['conf_file_name']
        if any(sl in conf_file_name for sl in skip_list):
            continue

        print('---------PROGRESS---------: ', config_i)
        print('---config---:', config)
        namestr = f'inst{inst}-' + conf_file_name.replace('genConf_ds_','').replace('.yaml','').replace('_','-')
        print('---JOB NAME---:', namestr)

        # conf_file_name already ends in '.yaml'; appending the extension
        # unconditionally produced 'configs/<name>.yaml.yaml'.
        conf_yaml = conf_file_name if conf_file_name.endswith('.yaml') else f'{conf_file_name}.yaml'

        envs = {
            # "DATA_S3_PATH": f's3://{sagemaker_default_bucket}/qwen2-train-dataset/*',
            # 'MODEL_ID_OR_S3_PATH': f's3://llm-artifacts-us-east-1/MTLM-llama-3-8b-instruct/*',
            'MODEL_ID_OR_S3_PATH': f's3://llm-artifacts-us-east-1/Llama-3.2-3B-Instruct/*',
            'MODEL_SAVE_PATH_S3': f's3://{sagemaker_default_bucket}/output-model/241201/',
            'CONF_YAML_NAME': f'configs/{conf_yaml}'
        }

        smp_estimator = Estimator(role=role,
            sagemaker_session=sess,
            base_job_name=namestr,
            entry_point="estimator_entry.py",
            source_dir='submit_src/',
            instance_type=instance_type,
            instance_count=inst,
            environment=envs,
            hyperparameters={},
            image_uri=image_uri,
            max_run=7200,
            keep_alive_period_in_seconds=1800,
            enable_remote_debug=True,
            disable_output_compression=True,
        )

        # A failing job (e.g. OOM) must not abort the rest of the sweep;
        # report it and move on to the next config.
        try:
            smp_estimator.fit()
        except Exception as e:
            print('---training job breaks---')
            print(e)
            continue

        training_job_name = smp_estimator.latest_training_job.job_name
        print('---_current_job_name---:', training_job_name)

        _save_job_logs(
            logs_client,
            log_group_name,
            training_job_name,
            f'{LOG_DIR}/log-{namestr}.logs',
        )



---------PROGRESS---------:  2
---config---: {'path': 'submit_src/configs/genConf_ds_z1_mbs1_acm8.yaml', 'conf_file_name': 'genConf_ds_z1_mbs1_acm8.yaml', 'params': {'zero_conf': 'ds_z1', 'micro_bs': 1, 'accum_steps': 8}}
---JOB NAME---: inst1-z1-mbs1-acm8


2024-12-10 13:13:30 Starting - Starting the training job...
2024-12-10 13:13:44 Pending - Training job waiting for capacity.........
2024-12-10 13:15:15 Pending - Preparing the instances for training................................................
2024-12-10 13:23:34 Downloading - Downloading input data......
2024-12-10 13:24:10 Downloading - Downloading the training image..................
2024-12-10 13:27:12 Training - Training image download completed. Training in progress...bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
  "cipher": algorithms.TripleDES,
  "class": algorithms.TripleDES,
2024-12-10 13:27:53,362 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2024-12-10 13:27:53,629 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-12-10 13:27:53,640 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succee

2024-12-10 13:30:36 Starting - Starting the training job
2024-12-10 13:30:36 Pending - Training job waiting for capacity.........
2024-12-10 13:32:00 Pending - Preparing the instances for training................................................
2024-12-10 13:39:58 Downloading - Downloading the training image..............

In [None]:
llama3_full_dpo_z2_1_8, inst2, 4.33s/it


llama3_full_dpo_z2_1_4, inst2, 2.10s/it
llama3_full_dpo_z2_1_8, inst1, 3.70s/it

llama3_full_dpo_z1_1_4, inst2, queue1
llama3_full_dpo_z1_1_8, inst1, queue2


llama3_full_dpo_z3_2_2, inst2, queue3