In [None]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role

# Make the shared helper modules importable by adding ../01_modules and/or
# ./01_modules (whichever exists) to the module search path.
# The lookup is anchored at the directory of this file when it runs as a script
# (``__file__`` is defined) and at the current working directory otherwise
# (e.g. inside a notebook kernel).
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_notebook = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_processing_script = os.path.abspath(os.path.join(script_dir, '01_modules'))
for _modules_path in (modules_path_in_notebook, modules_path_in_processing_script):
    # Skip paths that are already registered so that re-running this cell does
    # not keep appending duplicate entries to sys.path.
    if os.path.exists(_modules_path) and _modules_path not in sys.path:
        sys.path.append(_modules_path)


# Jupyter executes a local module only the first time it is imported after a
# kernel start. Re-running a cell with "import mymodulename" does not pick up
# changes made to the module file since then.
# As a workaround, modules that are already loaded are refreshed with
# importlib.reload; modules that are not loaded yet are imported normally.
# This way a fresh kernel executes each module once instead of twice (a plain
# import followed by an unconditional reload runs the module code two times,
# which shows up as duplicated "loaded" banners in the cell output).
for _module_name in ('utils', 'config'):
    if _module_name in sys.modules:
        importlib.reload(sys.modules[_module_name])
    else:
        importlib.import_module(_module_name)
# Bind the (now up-to-date) modules to their usual names in this namespace.
import utils
import config

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1
config.py loaded: v0.1


In [None]:
# !pip install "numpy<2.3.0" "sagemaker==2.250.0" "transformers==4.48.0" "boto3==1.38.46" "awscli==1.40.45" # "torch==2.3.0" 
# 1.39.0, 1.40.0, 1.40.1, 1.40.2, , 1.40.4, 1.40.5, 1.40.6, 1.40.7, , , 1.40.10, 1.40.11, , 1.40.13, 1.40.14, 1.40.15, 1.40.16, , 1.40.18, 1.40.19, 1.40.20, 1.40.21, 1.40.22, 1.40.23, 1.40.24, 1.40.25, 1.40.26, 1.40.27, 1.40.28, 1.40.29, 1.40.30, 1.40.31, 1.40.32, 1.40.33, 1.40.34, 1.40.35, 1.40.36, 1.40.37, 1.40.38, 1.40.39, 1.40.40, 1.40.41, 1.40.42, 1.40.43, 1.40.44, , 1.41.0, 1.41.1, 1.41.2, 1.41.3, 1.41.4, 1.41.5, 1.41.6, 1.41.7, 1.41.8, 1.41.9, 1.41.10, 1.41.11, 1.41.12, 1.41.13, 1.41.14, 1.41.15, 1.41.16, 1.41.17, 1.42.0, 1.42.1, 1.42.2, 1.42.3, 1.42.4, 1.42.5, 1.42.6, 1.42.7, 1.42.8, 1.42.9, 1.42.10, 1.42.11, 1.42.12, 1.42.13
# !pip install "torch==2.3.0" --index-url https://download.pytorch.org/whl/cpu
# !python --version
# !pip uninstall torch -y

Found existing installation: torch 2.3.0+cpu
Uninstalling torch-2.3.0+cpu:
  Successfully uninstalled torch-2.3.0+cpu


In [2]:
# Display the installed SageMaker Python SDK version.
# The top-level package is imported here because the imports at the top of the
# notebook only pull individual names out of sagemaker submodules.
import sagemaker
sagemaker.__version__

'2.250.0'

In [3]:
# Display the installed torch version. The part before the "+" build suffix is
# what gets passed as pytorch_version to the HuggingFaceProcessor further down.
import torch
torch.__version__

'2.5.1+cu124'

In [4]:
# Display the installed transformers version. This exact string is passed as
# transformers_version to the HuggingFaceProcessor further down.
import transformers
transformers.__version__

'4.49.0'

In [5]:
# Input and output folders of this run, relative to the configured S3 bucket root.
S3_INPUT_DIR_NAME = '00_test/input'
S3_OUTPUT_DIR_NAME = '00_test/output'

S3_INPUT_DIR_PATH = f'{config.DEFAULT_S3_BUCKET_ROOT}/{S3_INPUT_DIR_NAME}'
S3_OUTPUT_DIR_PATH = f'{config.DEFAULT_S3_BUCKET_ROOT}/{S3_OUTPUT_DIR_NAME}'

# Set up the processor. The framework versions are taken from the packages
# installed in this kernel; for torch only the part before a build suffix such
# as "+cpu" is used.
hfp = HuggingFaceProcessor(
    role=get_execution_role(),
    instance_count=1,
    py_version='py311',
    transformers_version=transformers.__version__,
    pytorch_version=torch.__version__.partition('+')[0],
    instance_type='ml.g5.2xlarge',
    base_job_name='test-job-002',
    # max_runtime_in_seconds=86400
)

# Data made available to the job: everything below the S3 input folder.
job_inputs = [
    ProcessingInput(
        input_name='data',
        source=S3_INPUT_DIR_PATH,
        destination=f'{config.DEFAULT_PROCESSING_FILEPATH_PREFIX}/input/data/',
    ),
]

# Results collected from the job and written to the S3 output folder.
job_outputs = [
    ProcessingOutput(
        output_name='results',
        source=f'{config.DEFAULT_PROCESSING_FILEPATH_PREFIX}/output/results/',
        destination=S3_OUTPUT_DIR_PATH,
    ),
    # Currently disabled additional outputs:
    # ProcessingOutput(output_name='train', source='/opt/ml/processing/output/train/', destination=f's3://{BUCKET}/{S3_OUTPUT_PATH}'),
    # ProcessingOutput(output_name='test', source='/opt/ml/processing/output/test/', destination=f's3://{BUCKET}/{S3_OUTPUT_PATH}'),
    # ProcessingOutput(output_name='val', source='/opt/ml/processing/output/val/', destination=f's3://{BUCKET}/{S3_OUTPUT_PATH}')
]

# Command-line arguments handed to the ingestion script.
job_arguments = [
    '--runtype', 'prod',
    '--test-argument-key-01', 'test-argument-value-01',
    '--test-argument-key-02', 'test-argument-value-02',
]

# Launch the processing job. The parent of the current working directory is
# passed as source_dir; the entry-point script is given relative to it.
hfp.run(
    code='02_ingestion/02_02_ingestion_semanticscholar.py',
    source_dir=os.path.abspath(os.path.join(os.getcwd(), '..')),
    inputs=job_inputs,
    outputs=job_outputs,
    arguments=job_arguments,
)

INFO:sagemaker:Creating processing-job with name test-job-002-2025-08-21-00-05-12-396


............................/opt/ml/processing/input/code
total 36
drwxr-xr-x 4 1000 users  4096 Aug 20 21:07 00_system
drwxr-xr-x 3 1000 users  4096 Aug 20 20:53 01_modules
drwxr-xr-x 4 1000 users  4096 Aug 20 23:59 02_ingestion
-rw-r--r-- 1 root root  24369 Aug 21 00:06 sourcedir.tar.gz
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1
config.py loaded: v0.1
Keys stored in Secrets Managers for the secret "semanticscholar_api_key": ['x-api-key']
['2022-05-10', '2022-05-17', '2022-05-24', '2022-05-31', '2022-06-07', '2022-06-14', '2022-06-21', '2022-06-28', '2022-07-05', '2022-07-19', '2022-07-28', '2022-08-02', '2022-08-09', '2022-08-16', '2022-08-23', '2022-08-30', '2022-09-06', '2022-09-13', '2022-09-28', '2022-10-05', '2022-10-28', '2022-11-02', '2022-11-11', '2022-11-15', '2022-11-22', '2022-12-02', '2022-12-06', '2022-12-13', '2022-12-20', '2022-12-27', '2023-01-03', '2023-01-10', '2023-01-17', '2023-01-24', '2023-01-31', '2023-02-07', '2023-02-14', '2023-0