In [10]:
import argparse
import boto3
import evaluate
import importlib
import json
import lighteval
import os
import pathlib
import requests
import shutil
import sys
import tarfile
import time
import torch
import transformers
import uuid
import wandb

import awswrangler as wr
import numpy as np

from botocore.exceptions import ClientError
from datasets import load_dataset, DatasetDict, Dataset
from datetime import datetime, timezone
from IPython.display import display
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
from sagemaker import image_uris, utils as sm_utils
from sagemaker.huggingface import HuggingFace
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from typing import List, Union, Optional


# Adding ../01_modules or ./01_modules to the system path so that we can load modules from 
# there as well
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
if os.path.exists(modules_path_in_dev):
    sys.path.append(modules_path_in_dev)
if os.path.exists(modules_path_in_prod):
    sys.path.append(modules_path_in_prod)


# # Jupyter only reads a local module the first time after 
# # kernel start. Re-running a cell with 
# # "from mymodulename import *" would not change
# # anything, even if the imported module has since changed.
# # As a workaround, we need to directly load the module, 
# # use importlib.reload to reload it and then import * 
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


utils.py loaded: v0.2.12
config.py loaded: v0.1


In [29]:
_ = """
ml.g6.xlarge:   6
ml.g6.2xlarge:  3
ml.g6.4xlarge:  3
ml.g6.8xlarge:  3
ml.g6.12xlarge: 3
ml.g6.16xlarge: 3
ml.g6.24xlarge: 3
ml.g6.48xlarge: 3
"""

INSTANCE_TYPE = 'ml.g6.2xlarge'
ENTRY_POINT = '05_tuning_basic/05_12_tuning_basic_simple.py',

JOB_NAME = f'hf-boto-{INSTANCE_TYPE.replace(".","-")}-{uuid.uuid4().hex}'
BOTO3_CLIENT = boto3.client('sagemaker')
S3_CLIENT = boto3.client('s3')
EXECUTION_ROLE = get_execution_role()
SCRIPT_FILEPATH = script_dir
SOURCE_DIRPATH = SCRIPT_FILEPATH.parents[0]
ROOT_DIRPATH = SCRIPT_FILEPATH.parents[1]
TEMP_DIRPATH = pathlib.Path(f'./_code/{JOB_NAME}')
TAR_FILEPATH = pathlib.Path(f'./_tar/source-{JOB_NAME}.tar.gz')
# config.DEFAULT_S3_BUCKET_NAME
print('SOURCE_DIRPATH', SOURCE_DIRPATH)

SOURCE_DIRPATH /home/sagemaker-user/research_methodology_extraction/src


In [None]:
if TEMP_DIRPATH.exists():
    shutil.rmtree(TEMP_DIRPATH.parents[1])
TEMP_DIRPATH.mkdir(parents=True, exist_ok=True)

if TAR_FILEPATH.parents[0].exists():
    shutil.rmtree(TAR_FILEPATH.parents[0])
TAR_FILEPATH.parents[0].mkdir(parents=True, exist_ok=True)

In [31]:
# Copy contents of parent dir (src_root) into temp_dir
# Avoid copying the temp_dir itself if it resides inside src_root (it does not here: temp_dir is cwd, fine)
# , '.git', 'requirements.txt', 'requirements_full.txt', '.gitignore', '
ignore_names = {'__pycache__', '.ipynb_checkpoints'}
for item in SOURCE_DIRPATH.iterdir():
    name = item.name
    if name in ignore_names:
        continue
    dest = TEMP_DIRPATH / name
    if item.is_dir():
        print('item.is_dir()', item, dest)
        for item2 in item.iterdir():
            name2 = item2.name
            if name2 in ignore_names:
                continue
            dest2 = TEMP_DIRPATH / name / name2
            if item2.is_dir():
                print('item2.is_dir() NOT COPYING', item2, dest2)
                # shutil.copytree(item, dest, ignore=shutil.ignore_patterns('__pycache__', '*.pyc', '*.pyo', '*.tmp'), dirs_exist_ok=True)
            else:
                print('else', item2, dest2)
                dest.mkdir(parents=True, exist_ok=True)
                shutil.copy2(item2, dest2)
        # shutil.copytree(item, dest, ignore=shutil.ignore_patterns('__pycache__', '*.pyc', '*.pyo', '*.tmp'), dirs_exist_ok=True)
    else:
        print('else', item, dest)
        shutil.copy2(item, dest)

shutil.copy2(ROOT_DIRPATH / 'requirements_train.txt', TEMP_DIRPATH / 'requirements.txt')

item.is_dir() /home/sagemaker-user/research_methodology_extraction/src/00_system _code/_06_hf-boto-ml-g6-2xlarge-6403b5fe9fcb47df9a5363e0cffff970/00_system
else /home/sagemaker-user/research_methodology_extraction/src/00_system/_sysinfo.ipynb _code/_06_hf-boto-ml-g6-2xlarge-6403b5fe9fcb47df9a5363e0cffff970/00_system/_sysinfo.ipynb
item2.is_dir() NOT COPYING /home/sagemaker-user/research_methodology_extraction/src/00_system/lifecycle_config_exports _code/_06_hf-boto-ml-g6-2xlarge-6403b5fe9fcb47df9a5363e0cffff970/00_system/lifecycle_config_exports
else /home/sagemaker-user/research_methodology_extraction/src/00_system/on_start.ipynb _code/_06_hf-boto-ml-g6-2xlarge-6403b5fe9fcb47df9a5363e0cffff970/00_system/on_start.ipynb
else /home/sagemaker-user/research_methodology_extraction/src/00_system/on_start.py _code/_06_hf-boto-ml-g6-2xlarge-6403b5fe9fcb47df9a5363e0cffff970/00_system/on_start.py
else /home/sagemaker-user/research_methodology_extraction/src/00_system/studio_jupyterlab_on_start.s

PosixPath('_code/_06_hf-boto-ml-g6-2xlarge-6403b5fe9fcb47df9a5363e0cffff970/requirements.txt')

In [None]:
# Tar the temp_dir (its contents become root of /opt/ml/code)
with tarfile.open(tar_path, 'w:gz') as tar:
    tar.add(str(TEMP_DIRPATH), arcname='.')


In [None]:

# 2. Upload code to S3

code_s3_key = f'training-artifacts/{job_name}/source.tar.gz'
s3.upload_file(str(tar_path), bucket, code_s3_key)
code_s3_uri = f's3://{bucket}/{code_s3_key}'

print('Uploaded code to:', code_s3_uri)

# 3. Resolve the HuggingFace DLC image (matching transformers/pytorch/py versions)
image_uri = image_uris.retrieve(
    framework='huggingface',
    region=region,
    version='4.26.0',                 # transformers version
    py_version='py39',
    instance_type=instance_type,
    image_scope='training',
    base_framework_version='pytorch1.13'
)
print('Using training image:', image_uri)

# 4. Hyperparameters (all values must be strings)
hyperparameters = {
    # SageMaker training toolkit special keys:
    'sagemaker_program': entry_point,
    'sagemaker_submit_directory': code_s3_uri,
    'sagemaker_container_log_level': '20',
    'sagemaker_region': region,

    # Your script args:
    'runtype': 'prod',
    'instance_type': instance_type,
    'model_name': 'distilbert-base-uncased',
    'hf_dataset_suffix': '_Title_SubfieldIndex',
    'label_type': 'subfield',
    'text_key': 'title',
    'text_key_rename_to': 'text',
    'label_key_rename_to': 'label',
    'sample': '1',          # must be string
    # Optional training overrides (if you want to force):
    # 'epochs': '5',
    # 'train_batch_size': '32',
    # 'eval_batch_size': '64',
    # 'warmup_steps': '500',
    # 'learning_rate': '5e-5'
}

# 5. Input channel (same as Estimator.fit inputs)
input_data_config = [
    {
        'ChannelName': 'train',
        'DataSource': {
            'S3DataSource': {
                'S3DataType': 'S3Prefix',
                'S3Uri': 's3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works_train/',
                'S3DataDistributionType': 'FullyReplicated'
            }
        },
        'InputMode': 'File'
    }
]

# 6. Create training job via low-level API
sm = boto3.client('sagemaker', region_name=region)

try:
    resp = sm.create_training_job(
        TrainingJobName=job_name,
        RoleArn=role_arn,
        AlgorithmSpecification={
            'TrainingImage': image_uri,
            'TrainingInputMode': 'File'
        },
        HyperParameters=hyperparameters,
        InputDataConfig=input_data_config,
        OutputDataConfig={'S3OutputPath': f's3://{bucket}/training-output/'},
        ResourceConfig={
            'InstanceType': instance_type,
            'InstanceCount': 1,
            'VolumeSizeInGB': 300
        },
        StoppingCondition={'MaxRuntimeInSeconds': 3600},
        Environment={
            'HUGGINGFACE_HUB_CACHE': '/tmp/.cache'
        },
        EnableManagedSpotTraining=False
    )
    print('Training job created:', job_name)
except ClientError as e:
    print('create_training_job failed:')
    print(e.response.get('Error', e))
    raise

# 7. (Optional) simple waiter + log group polling
print('Polling status (CTRL+C to stop)...')
logs = boto3.client('logs', region_name=region)
log_group = '/aws/sagemaker/TrainingJobs'

def stream_logs():
    seen = set()
    while True:
        desc = sm.describe_training_job(TrainingJobName=job_name)
        status = desc['TrainingJobStatus']
        print('Status:', status)
        # Try to fetch log streams
        try:
            streams = logs.describe_log_streams(
                logGroupName=log_group,
                logStreamNamePrefix=job_name
            ).get('logStreams', [])
            for s in streams:
                name = s['logStreamName']
                events = logs.get_log_events(
                    logGroupName=log_group,
                    logStreamName=name,
                    startFromHead=True
                )['events']
                for ev in events:
                    if ev['eventId'] in seen: 
                        continue
                    seen.add(ev['eventId'])
                    print(ev['message'].rstrip())
        except logs.exceptions.ResourceNotFoundException:
            pass

        if status in ('Completed','Failed','Stopped'):
            print('Final status:', status)
            if status == 'Failed':
                print('Failure reason:', desc.get('FailureReason'))
            break
        time.sleep(30)

# Uncomment to tail inside script run:
# stream_logs()

In [9]:


hyperparameters={
    'runtype': 'prod',
    'instance_type': instance_type,
    'model_name': 'distilbert-base-uncased',
    'hf_dataset_suffix': '_Title_SubfieldIndex',
    'label_type': 'subfield',
    'text_key': 'title',
    'text_key_rename_to': 'text',
    'label_key_rename_to': 'label',
    'sample': 1,
    
    # 'epochs': 1,                                       # number of training epochs
    # 'train_batch_size': 32,                            # training batch size
    # 'model_name':'distilbert/distilbert-base-uncased'  # name of pretrained model
}

env_vars = {
    'HUGGINGFACE_HUB_CACHE': '/tmp/.cache'
}

huggingface_estimator = HuggingFace(
    entry_point='05_tuning_basic/05_12_tuning_basic_simple.py',                 # fine-tuning script to use in training job
    source_dir='..',                 # directory where fine-tuning script is stored
    max_run=1*60*60, # in seconds
    volume_size=300,
    instance_type=instance_type,          # instance type
    instance_count=1,                       # number of instances
    role=execution_role,                              # IAM role used in training job to acccess AWS resources (S3)
    transformers_version='4.26',             # Transformers version
    pytorch_version='1.13',                  # PyTorch version
    py_version='py39',                      # Python version
    disable_output_compression=True,
    environment=env_vars,
    hyperparameters=hyperparameters         # hyperparameters to use in training job
)

In [11]:
huggingface_estimator.fit(
    {
        'train': 's3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works_train/'
    },
    wait=False
) # {"train": training_input_path, "test": test_input_path}