# 权限配置

In [13]:
import sagemaker
import os
# Open the SageMaker session used by the rest of the notebook and look up
# the execution role (printed below as an IAM role ARN).
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# Report the role, the session's default bucket and its region, one per line.
print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sess.default_bucket()}",
    f"sagemaker session region: {sess.boto_region_name}",
    sep="\n",
)

sagemaker role arn: arn:aws:iam::847380964353:role/spot-bot-SpotSageMakerExecutionRole-TP8BLT3Z5JJL
sagemaker bucket: sagemaker-us-west-2-847380964353
sagemaker session region: us-west-2


# 数据准备

In [14]:
# Local folder that holds the dataset files to upload.
WORK_DIRECTORY = './data/'
# Dataset identifier. NOTE(review): not referenced by any cell visible here —
# confirm whether it is still needed.
dataset_name = 'amazon_review_t5'
# S3 key prefix the files are stored under.
s3_prefix = 'datasets/amazon_review_t5'

# Upload the folder through the session (no bucket is passed explicitly) and
# keep the returned S3 URI; later cells append file names to it.
data_location = sess.upload_data(path=WORK_DIRECTORY, key_prefix=s3_prefix)
data_location

's3://sagemaker-us-west-2-847380964353/datasets/amazon_review_t5'

# 超参数定义

In [15]:
from sagemaker.huggingface import HuggingFace
from sagemaker.huggingface import TrainingCompilerConfig

# Training can be sped up with the SageMaker Training Compiler, see: https://towardsdatascience.com/speed-up-hugging-face-training-jobs-on-aws-by-up-to-50-with-sagemaker-training-compiler-9ad2ac5b0eb

# Hyperparameters handed to the training job.
# NOTE(review): presumably parsed by scripts/run_train.py into Hugging Face
# training arguments — that script is not visible here, so confirm how each
# key is consumed.
hyperparameters = {
    # --- dataset columns (meaning defined by run_train.py; TODO confirm) ---
    'reference_column': 'customer_review',
    'hypothesis_column': 'category',

    # --- data files: paths under /opt/ml/input/data/<channel>/, matching the
    #     train / validation / test channel names passed to fit() ---
    'train_file': '/opt/ml/input/data/train/train.csv',
    'validation_file': '/opt/ml/input/data/validation/validation.csv',
    'test_file': '/opt/ml/input/data/test/test.csv',
    'output_dir': '/opt/ml/model',

    # --- which phases to run ---
    'do_train': True,
    'do_eval': True,

    # --- model and sequence lengths ---
    'max_source_length': 128,
    'max_target_length': 128,
    'model_name_or_path': 't5-base',

    # --- optimisation ---
    'learning_rate': 3e-4,
    'num_train_epochs': 10,
    # The original line carried a trailing "#16" marker — presumably an
    # earlier batch size; confirm before changing.
    'per_device_train_batch_size': 2,
    'gradient_accumulation_steps': 2,

    # --- checkpointing and evaluation schedule ---
    'save_strategy': 'steps',
    'evaluation_strategy': 'epoch',
    'save_total_limit': 1,
    # NOTE(review): in Hugging Face TrainingArguments, eval_steps only applies
    # when evaluation_strategy is 'steps'; with 'epoch' above it looks inert.
    # The training job name mentions "stepeval", so confirm which is intended.
    'eval_steps': 5000,
}

# Settings for SageMaker's distributed data-parallel library. The estimator
# cell currently has its `distribution=` argument commented out, so this
# dict is defined but not passed anywhere in the visible cells.
distribution = {
    'smdistributed': {
        'dataparallel': {'enabled': True},
    },
}
# Describe the training job: which script to run, on what hardware, and with
# which framework versions. Nothing is launched until fit() is called.
huggingface_estimator = HuggingFace(
    # Training code: ./scripts/run_train.py
    source_dir='./scripts',
    entry_point='run_train.py',
    hyperparameters=hyperparameters,

    # Permissions and compute
    role=role,
    instance_type='ml.p3dn.24xlarge',
    instance_count=1,
    volume_size=128,         # storage volume size in GB
    max_run=24 * 60 * 60,    # run-time limit in seconds (24 hours)

    # Framework versions of the training container
    transformers_version='4.6',
    pytorch_version='1.7',
    py_version='py36',

    # Optional features, currently switched off:
    # compiler_config=TrainingCompilerConfig(),
    # distribution=distribution,
)

# 模型训练

In [None]:
# Launch the training job with one input channel per CSV uploaded to S3.
#
# SageMaker requires training job names to be unique within an account and
# region, so a fixed job_name makes every re-run of this cell fail. A UTC
# timestamp suffix keeps the name recognisable while making each run unique.
import time

job_name_base = 'train-amazon-review-t5-base-10epoch-stepeval'
# 44-char base + '-' + 15-char timestamp = 60 chars, within the 63-char limit.
training_job_name = f"{job_name_base}-{time.strftime('%Y%m%d-%H%M%S', time.gmtime())}"

huggingface_estimator.fit(
    {
        'train': data_location + '/train.csv',
        'test': data_location + '/test.csv',
        'validation': data_location + '/validation.csv',
    },
    job_name=training_job_name,
)

2022-04-29 08:45:03 Starting - Starting the training job...
2022-04-29 08:45:27 Starting - Preparing the instances for trainingProfilerReport-1651221902: InProgress
........