In [1]:
import argparse
import importlib
import json
import os
import pathlib
import sys

import boto3
import requests
import sagemaker
import torch
import transformers
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role

# Make the shared `01_modules` directory importable. Two locations relative to this
# notebook/script are probed: ../01_modules (the "dev" path) and ./01_modules (the
# "prod" path). Whichever exists is appended to sys.path.
if '__file__' in globals():
    # Executed as a regular script: anchor on the directory containing the script.
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    # `__file__` is not defined (e.g. inside a notebook kernel): use the working directory.
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
for _candidate in (modules_path_in_dev, modules_path_in_prod):
    # Skip entries that are already registered so that re-running this cell does not
    # keep appending the same directory to sys.path.
    if os.path.exists(_candidate) and _candidate not in sys.path:
        sys.path.append(_candidate)


# Load the project-local `utils` and `config` modules.
# A Jupyter kernel caches a module after its first import, so re-running a cell with a
# plain `import` would not pick up edits made to the module file in the meantime.
# To always work with fresh code without executing the module twice on a fresh
# kernel, import it normally the first time and reload it only when it is already cached.
def _import_or_reload(module_name):
    """Return the named module freshly executed.

    If the module is already present in ``sys.modules`` it is reloaded via
    ``importlib.reload``; otherwise it is imported for the first time.
    """
    cached = sys.modules.get(module_name)
    if cached is not None:
        return importlib.reload(cached)
    return importlib.import_module(module_name)


utils = _import_or_reload('utils')
config = _import_or_reload('config')

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1
config.py loaded: v0.1


In [3]:
# Show the SageMaker SDK version in use; `sagemaker` is imported in the first cell.
sagemaker.__version__

'2.250.0'

In [4]:
# Show the PyTorch version of the current kernel; `torch` is already imported in the first cell.
torch.__version__

'2.5.1+cu124'

In [5]:
# Show the transformers version of the current kernel; `transformers` is already imported in the first cell.
transformers.__version__

'4.49.0'

In [18]:
# Disabled cell: an earlier HuggingFaceProcessor-based version of the processing job.
# The code is wrapped in a triple-quoted string and assigned to `removed`, so nothing in
# it is executed; running this cell only binds the text to that name.
# NOTE(review): `removed` is not read anywhere in the visible cells - consider deleting
# this cell once the draft is no longer needed for reference.
removed = """
S3_INPUT_DIR_NAME = '00_test/input'
S3_OUTPUT_DIR_NAME = '00_test/output'

S3_INPUT_DIR_PATH = f'{config.DEFAULT_S3_BUCKET_ROOT}/{S3_INPUT_DIR_NAME}'
S3_OUTPUT_DIR_PATH = f'{config.DEFAULT_S3_BUCKET_ROOT}/{S3_OUTPUT_DIR_NAME}'



#Initialize the HuggingFaceProcessor
hfp = HuggingFaceProcessor(
    role=get_execution_role(), 
    instance_count=1,
    py_version='py311',
    transformers_version=transformers.__version__,
    pytorch_version=torch.__version__.split('+')[0],  # Get the version before any suffix like "+cpu"
    instance_type='ml.g5.2xlarge',
    base_job_name='test-job-002',
    # max_runtime_in_seconds=86400
)

#Run the processing job
hfp.run(
    code='02_ingestion/02_02_ingestion_semanticscholar.py',
    source_dir=os.path.abspath(os.path.join(os.getcwd(), '..')),
    inputs=[
        ProcessingInput(
            input_name='data',
            source=S3_INPUT_DIR_PATH,
            destination=f'{config.DEFAULT_PROCESSING_FILEPATH_PREFIX}/input/data/'
        )
    ],
    outputs=[
        ProcessingOutput(output_name='results', source=f'{config.DEFAULT_PROCESSING_FILEPATH_PREFIX}/output/results/', destination=S3_OUTPUT_DIR_PATH),
        # ProcessingOutput(output_name='train', source='/opt/ml/processing/output/train/', destination=f's3://{BUCKET}/{S3_OUTPUT_PATH}'),
        # ProcessingOutput(output_name='test', source='/opt/ml/processing/output/test/', destination=f's3://{BUCKET}/{S3_OUTPUT_PATH}'),
        # ProcessingOutput(output_name='val', source='/opt/ml/processing/output/val/', destination=f's3://{BUCKET}/{S3_OUTPUT_PATH}')
    ],
    arguments=[
        '--runtype', 'prod',
        '--test-argument-key-01', 'test-argument-value-01', 
        '--test-argument-key-02', 'test-argument-value-02'
    ]
)
"""

In [None]:
# Disabled cell: an earlier FrameworkProcessor/SKLearn draft of the ingestion job that
# still used explicit ProcessingInput/ProcessingOutput mappings.
# The code is wrapped in a triple-quoted string and assigned to `removed`, so nothing in
# it is executed; running this cell only rebinds `removed` to this text.
# NOTE(review): `removed` is not read anywhere in the visible cells - consider deleting
# this cell once the draft is no longer needed for reference.
removed = """
S3_INPUT_DIR_NAME = '00_test/input'
S3_OUTPUT_DIR_NAME = '00_test/output'

S3_INPUT_DIR_PATH = f'{config.DEFAULT_S3_BUCKET_ROOT}/{S3_INPUT_DIR_NAME}'
S3_OUTPUT_DIR_PATH = f'{config.DEFAULT_S3_BUCKET_ROOT}/{S3_OUTPUT_DIR_NAME}'

# Using FrameWorkProcessor primarily to be able to use source_dir during the run step, even if we don't need the full flexibility of FrameWorkProcessor or the SKLearn framework
execution_role = get_execution_role()
# pipeline_session = PipelineSession()
print(f'Using execution role: {execution_role}')
sklearn_processor = FrameworkProcessor(
    estimator_cls=SKLearn,
    framework_version='1.2-1', # The newest supported version by sagemaker
    instance_type='ml.t3.medium', # We are just downloading&uploading data, so a small instance is sufficient
    instance_count=1,
    base_job_name='SemanticsScholar-ingestion-FrameworkProcessor-SKlearn-1-7-0',
    # sagemaker_session=pipeline_session,
    role=execution_role
)

step_args = sklearn_processor.run(
    code='02_ingestion/02_02_ingestion_semanticscholar.py',
    source_dir=os.path.abspath(os.path.join(os.getcwd(), '..')),
    inputs=[
        ProcessingInput(
            input_name='data',
            source=S3_INPUT_DIR_PATH,
            destination=f'{config.DEFAULT_PROCESSING_FILEPATH_PREFIX}/input/data/'
        )
    ],
    outputs=[
        ProcessingOutput(output_name='results', source=f'{config.DEFAULT_PROCESSING_FILEPATH_PREFIX}/output/results/', destination=S3_OUTPUT_DIR_PATH),
        # ProcessingOutput(output_name='train', source='/opt/ml/processing/output/train/', destination=f's3://{BUCKET}/{S3_OUTPUT_PATH}'),
        # ProcessingOutput(output_name='test', source='/opt/ml/processing/output/test/', destination=f's3://{BUCKET}/{S3_OUTPUT_PATH}'),
        # ProcessingOutput(output_name='val', source='/opt/ml/processing/output/val/', destination=f's3://{BUCKET}/{S3_OUTPUT_PATH}')
    ],
    arguments=[
        '--runtype', 'prod',
        '--process-files-force-overwrite', 'False',
        '--test-argument-key-01', 'test-argument-value-01', 
        '--test-argument-key-02', 'test-argument-value-02'
    ]
)

# step_process = ProcessingStep(
#     name="ProcessingNameTODO",
#     step_args=step_args
# )

# #Initialize the HuggingFaceProcessor
# hfp = HuggingFaceProcessor(
#     role=get_execution_role(), 
#     instance_count=1,
#     py_version='py311',
#     transformers_version=transformers.__version__,
#     pytorch_version=torch.__version__.split('+')[0],  # Get the version before any suffix like "+cpu"
#     instance_type='ml.g5.2xlarge',
#     base_job_name='test-job-002',
#     # max_runtime_in_seconds=86400
# )

#Run the processing job
# sklearn_processor.run(
#     code='02_ingestion/02_02_ingestion_semanticscholar.py',
#     source_dir=os.path.abspath(os.path.join(os.getcwd(), '..')),
#     inputs=[
#         ProcessingInput(
#             input_name='data',
#             source=S3_INPUT_DIR_PATH,
#             destination=f'{config.DEFAULT_PROCESSING_FILEPATH_PREFIX}/input/data/'
#         )
#     ],
#     outputs=[
#         ProcessingOutput(output_name='results', source=f'{config.DEFAULT_PROCESSING_FILEPATH_PREFIX}/output/results/', destination=S3_OUTPUT_DIR_PATH),
#         # ProcessingOutput(output_name='train', source='/opt/ml/processing/output/train/', destination=f's3://{BUCKET}/{S3_OUTPUT_PATH}'),
#         # ProcessingOutput(output_name='test', source='/opt/ml/processing/output/test/', destination=f's3://{BUCKET}/{S3_OUTPUT_PATH}'),
#         # ProcessingOutput(output_name='val', source='/opt/ml/processing/output/val/', destination=f's3://{BUCKET}/{S3_OUTPUT_PATH}')
#     ],
#     arguments=[
#         '--runtype', 'prod',
#         '--test-argument-key-01', 'test-argument-value-01', 
#         '--test-argument-key-02', 'test-argument-value-02'
#     ]
# )

"""

In [6]:

# Launch the Semantic Scholar ingestion script as a SageMaker processing job.
# FrameworkProcessor (with the SKLearn estimator class) is chosen mainly because its
# run() step accepts `source_dir`; its wider flexibility and the SKLearn framework
# itself are not otherwise needed here.
execution_role = get_execution_role()

sklearn_processor = FrameworkProcessor(
    role=execution_role,
    base_job_name='SemanticsScholar-ingestion',
    estimator_cls=SKLearn,
    framework_version='1.2-1',  # noted by the author as the newest version SageMaker supports
    instance_count=1,
    instance_type='ml.t3.medium',  # the job only downloads and uploads data, so a small instance is enough
)

# The parent of the current working directory is passed as the job's source directory.
job_source_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Command-line flags handed to the ingestion script.
ingestion_arguments = [
    '--runtype', 'prod',
    '--dataset-id', 's2orc',
    '--release-id', '2025-08-12',
    # '--force-overwrite', '0',
    '--min-index', '0',
    '--max-index', '1000000',
]

step_args = sklearn_processor.run(
    code='02_ingestion/02_02_ingestion_semanticscholar.py',
    source_dir=job_source_dir,
    # No automatic input/output mapping is used: the script works against S3 directly.
    inputs=[],
    outputs=[],
    arguments=ingestion_arguments,
)

INFO:sagemaker.processing:Uploaded /home/sagemaker-user/research_methodology_extraction/src to s3://sagemaker-eu-west-2-762595428873/SemanticsScholar-ingestion-2025-08-23-19-44-07-718/source/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-eu-west-2-762595428873/SemanticsScholar-ingestion-2025-08-23-19-44-07-718/source/runproc.sh
INFO:sagemaker:Creating processing-job with name SemanticsScholar-ingestion-2025-08-23-19-44-07-718


..............utils.py loaded: v0.2.12
config.py loaded: v0.1
 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Keys stored in Secrets Managers for the secret "semanticscholar_api_key": ['x-api-key']
 :: Semanticscholar secret keys fetched | since_start: 0.18 seconds | since_last: 0.18 seconds :: 
Namespace(runtype='prod', dataset_id='s2orc', release_id='2025-08-12', force_overwrite=False, min_index=0, max_index=1000000)
 :: Processed arguments | since_start: 0.18 seconds | since_last: 0.00 seconds :: 
File already exists in S3: s3://sagemaker-research-methodology-extraction/01_data/01_raw/semanticscholar/s2orc/s2orc-part0.jsonl.gz. Skipping download and upload.
File already exists in S3: s3://sagemaker-research-methodology-extraction/01_data/01_raw/semanticscholar/s2orc/s2orc-part1.jsonl.gz. Skipping download and upload.
File already exists in S3: s3://sagemaker-research-methodology-extraction/01_data/01_raw/semanticscholar/s2orc/s2orc-part2.jsonl