In [3]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface import HuggingFace
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Make the shared `01_modules` folder importable.
# Two candidate locations are probed relative to this script/notebook:
#   - ../01_modules  (the "dev" layout)
#   - ./01_modules   (the "prod" layout)
# Every candidate that exists on disk is added to sys.path.
if '__file__' in globals():
    # __file__ is defined: anchor on the directory that contains this file.
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    # No __file__ available (e.g. when run cell-by-cell in a kernel):
    # fall back to the current working directory.
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
for _modules_path in (modules_path_in_dev, modules_path_in_prod):
    # Skip paths that are already registered so that re-running this cell
    # does not keep growing sys.path with duplicate entries.
    if os.path.exists(_modules_path) and _modules_path not in sys.path:
        sys.path.append(_modules_path)


# Jupyter only reads a local module the first time it is imported after
# kernel start. Re-running a cell with "from mymodulename import *" would
# not change anything, even if the imported module has since changed.
# As a workaround, the module object itself is imported and then re-executed
# with importlib.reload (with the star-import style, the `import *` would
# have to be repeated after the reload).
# The return value of reload is bound to `_`, presumably so that the module
# repr does not show up as cell output.
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


utils.py loaded: v0.2.12
config.py loaded: v0.1


In [9]:
# Look up the IAM execution role; it is handed to the HuggingFace estimator as `role`.
execution_role = get_execution_role()

# Scratch note on ml.g6 instance types, parked in a throwaway variable;
# it is not read anywhere in the visible cells.
# NOTE(review): the numbers look like per-instance-type limits/quotas - confirm.
_ = """
ml.g6.xlarge:   6
ml.g6.2xlarge:  3
ml.g6.4xlarge:  3
ml.g6.8xlarge:  3
ml.g6.12xlarge: 3
ml.g6.16xlarge: 3
ml.g6.24xlarge: 3
ml.g6.48xlarge: 3
"""

# Environment variables for the training job: sets the Hugging Face hub
# cache variable to /tmp/.cache.
env_vars = dict(HUGGINGFACE_HUB_CACHE='/tmp/.cache')

# Instance type for the training job; the same value is also forwarded to the
# training script through the hyperparameters below.
instance_type = 'ml.g6.2xlarge'

# Hyperparameters to use in the training job.
hyperparameters = dict(
    runtype='prod',
    instance_type=instance_type,
    model_name='distilbert-base-uncased',
    hf_dataset_suffix='_Title_SubfieldIndex',
    label_type='subfield',
    text_key='title',
    text_key_rename_to='text',
    label_key_rename_to='label',
    sample=1,
)
# Further options, currently disabled:
#   'epochs': 1                                         -> number of training epochs
#   'train_batch_size': 32                              -> training batch size
#   'model_name': 'distilbert/distilbert-base-uncased'  -> name of pretrained model

# Configure the Hugging Face training job; the job itself is started by the
# later `huggingface_estimator.fit(...)` call.
huggingface_estimator = HuggingFace(
    # --- code to run ---
    source_dir='..',  # directory where the fine-tuning script is stored
    entry_point='05_tuning_basic/05_12_tuning_basic_simple.py',  # fine-tuning script to use in the training job
    hyperparameters=hyperparameters,  # hyperparameters to use in the training job
    environment=env_vars,
    # --- framework versions ---
    transformers_version='4.26',  # Transformers version
    pytorch_version='1.13',  # PyTorch version
    py_version='py39',  # Python version
    # --- infrastructure ---
    role=execution_role,  # IAM role used in the training job to access AWS resources (S3)
    instance_type=instance_type,  # instance type
    instance_count=1,  # number of instances
    volume_size=300,
    max_run=1 * 60 * 60,  # in seconds
    disable_output_compression=True,
)

In [11]:
# Submit the training job. The `inputs` mapping has the form
# {"train": training_input_path, "test": test_input_path}; only a "train"
# entry is supplied here. wait=False is passed explicitly.
huggingface_estimator.fit(
    inputs={'train': 's3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works_train/'},
    wait=False,
)