# 准备训练环境

In [1]:
!pip install -U sagemaker
!pip install transformers



In [2]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch
from sagemaker.utils import name_from_base

# IAM execution role of this notebook; it is handed to the estimator further down.
role = get_execution_role()

# A single SageMaker session provides the AWS region and the default S3 bucket.
sess = sagemaker.Session()
region = sess.boto_session.region_name
sagemaker_default_bucket = sess.default_bucket()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


# 准备数据和预训练模型并上传到 S3

In [3]:
# S3 locations used by the rest of the notebook: training data, pretrained model,
# and job output. Nothing is uploaded here; this cell only builds and prints the URIs.
training_data_path = f's3://{sagemaker_default_bucket}/qwen3-vl/train/'
training_pretrain_path = f's3://{sagemaker_default_bucket}/fordeal/pretrain/'
training_output_path = f's3://{sagemaker_default_bucket}/fordeal/output/'

# Echo each location so the resolved bucket name is visible in the cell output.
for label, s3_uri in (
    ("training dataset s3 path:", training_data_path),
    ("training pretrain model s3 path:", training_pretrain_path),
    ("training output s3 path:", training_output_path),
):
    print(label, s3_uri)

training dataset s3 path: s3://sagemaker-us-west-2-101054729273/qwen3-vl/train/
training pretrain model s3 path: s3://sagemaker-us-west-2-101054729273/fordeal/pretrain/
training output s3 path: s3://sagemaker-us-west-2-101054729273/fordeal/output/


In [4]:
!aws s3 cp --recursive data "{training_data_path}data"

upload: data/coco_caption2017_1k.json to s3://sagemaker-us-west-2-101054729273/qwen3-vl/train/data/coco_caption2017_1k.json
upload: data/images/000000001761.jpg to s3://sagemaker-us-west-2-101054729273/qwen3-vl/train/data/images/000000001761.jpg
upload: data/images/000000006818.jpg to s3://sagemaker-us-west-2-101054729273/qwen3-vl/train/data/images/000000006818.jpg
upload: data/images/000000002532.jpg to s3://sagemaker-us-west-2-101054729273/qwen3-vl/train/data/images/000000002532.jpg
upload: data/images/000000002149.jpg to s3://sagemaker-us-west-2-101054729273/qwen3-vl/train/data/images/000000002149.jpg
upload: data/images/000000003661.jpg to s3://sagemaker-us-west-2-101054729273/qwen3-vl/train/data/images/000000003661.jpg
upload: data/images/000000003845.jpg to s3://sagemaker-us-west-2-101054729273/qwen3-vl/train/data/images/000000003845.jpg
upload: data/images/000000003156.jpg to s3://sagemaker-us-west-2-101054729273/qwen3-vl/train/data/images/000000003156.jpg
upload: data/images/00

In [5]:
# Download the Qwen/Qwen3-VL-4B-Instruct repository from the Hugging Face Hub into the
# local directory `Qwen3-VL-4B-Instruct` (relative to the notebook's working directory).
# NOTE(review): `--quiet` presumably suppresses progress output and prints only the
# final download path — confirm against the installed huggingface_hub CLI version.
!huggingface-cli download Qwen/Qwen3-VL-4B-Instruct --local-dir Qwen3-VL-4B-Instruct --quiet

/home/ec2-user/SageMaker/efs_data/workspace/Qwen3-VL-SageMaker-MultiNode-Training/Qwen3-VL-4B-Instruct


In [6]:
!aws s3 cp --recursive Qwen3-VL-4B-Instruct "{training_pretrain_path}Qwen3-VL-4B-Instruct"

upload: Qwen3-VL-4B-Instruct/.cache/huggingface/.gitignore to s3://sagemaker-us-west-2-101054729273/fordeal/pretrain/Qwen3-VL-4B-Instruct/.cache/huggingface/.gitignore
upload: Qwen3-VL-4B-Instruct/.cache/huggingface/download/.gitattributes.metadata to s3://sagemaker-us-west-2-101054729273/fordeal/pretrain/Qwen3-VL-4B-Instruct/.cache/huggingface/download/.gitattributes.metadata
upload: Qwen3-VL-4B-Instruct/.cache/huggingface/download/.gitattributes.lock to s3://sagemaker-us-west-2-101054729273/fordeal/pretrain/Qwen3-VL-4B-Instruct/.cache/huggingface/download/.gitattributes.lock
upload: Qwen3-VL-4B-Instruct/.cache/huggingface/download/generation_config.json.metadata to s3://sagemaker-us-west-2-101054729273/fordeal/pretrain/Qwen3-VL-4B-Instruct/.cache/huggingface/download/generation_config.json.metadata
upload: Qwen3-VL-4B-Instruct/.cache/huggingface/download/config.json.metadata to s3://sagemaker-us-west-2-101054729273/fordeal/pretrain/Qwen3-VL-4B-Instruct/.cache/huggingface/download/con

In [9]:
import time
from sagemaker.pytorch import PyTorch

# Base name for the training job; SageMaker appends a timestamp suffix to it.
base_job_name = "qwen3-vl-4b-coco"

# Cluster shape: how many instances to train on and which instance type to use.
# (Alternatives noted by the author: a single instance, or "ml.p5.48xlarge".)
instance_count = 2
instance_type = "ml.g6e.48xlarge"

# Environment variables made available to the training scripts.
# NODE_NUMBER mirrors instance_count; MODEL_S3_PATH points at the output prefix.
# NOTE(review): the FI_* / NCCL_* entries are networking settings — confirm they
# suit the chosen instance type.
environment = dict(
    NODE_NUMBER=str(instance_count),
    FI_PROVIDER="efa",
    NCCL_PROTO="simple",
    FI_EFA_USE_DEVICE_RDMA="1",
    NCCL_DEBUG="INFO",
    MODEL_S3_PATH=training_output_path,
)

# Build the PyTorch estimator that describes the training job.
# checkpoint_s3_uri is left unset: the earlier draft referenced `checkpoint_s3_path`,
# which no visible cell defines.
estimator = PyTorch(
    # Training code shipped to the job.
    entry_point="entry.py",
    source_dir="src",
    # Job identity and permissions.
    base_job_name=base_job_name,
    role=role,
    # Runtime: container framework/Python versions and script environment.
    framework_version="2.5.1",
    py_version="py311",
    script_mode=True,
    environment=environment,
    # Compute resources.
    instance_count=instance_count,
    instance_type=instance_type,
    # Output handling and debugging.
    output_path=training_output_path,
    disable_output_compression=True,
    enable_remote_debug=True,
)

In [None]:
# Inputs for the training job: a name -> S3 prefix mapping for the uploaded
# dataset and the pretrained model.
data = dict(
    train=training_data_path,
    pretrain=training_pretrain_path,
)

# Launch the training job; wait=True blocks this cell until the job ends.
estimator.fit(inputs=data, wait=True)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: qwen3-vl-4b-coco-2025-11-06-16-48-48-085


2025-11-06 16:48:48 Starting - Starting the training job
2025-11-06 16:48:48 Pending - Training job waiting for capacity......
2025-11-06 16:49:41 Pending - Preparing the instances for training.