In [2]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import smart_open
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role

# Adding ../01_modules or ./01_modules to the system path so that we can load modules from 
# there as well
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
if os.path.exists(modules_path_in_dev):
    sys.path.append(modules_path_in_dev)
if os.path.exists(modules_path_in_prod):
    sys.path.append(modules_path_in_prod)


# # Jupyter only reads a local module the first time after 
# # kernel start. Re-running a cell with 
# # "from mymodulename import *" would not change
# # anything, even if the imported module has since changed.
# # As a workaround, we need to directly load the module, 
# # use importlib.reload to reload it and then import * 
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 

utils.py loaded: v0.2.12
config.py loaded: v0.1


In [11]:
s3_client = boto3.client('s3')
openalex_works_target_prefix = f'{config.OPENALEX_S3_RAW_DATA_PREFIX}/data/works_unpartitioned'
openalex_works_target_path = f's3://{config.DEFAULT_S3_BUCKET_NAME}/{openalex_works_target_prefix}'

_ = """
for key, content in smart_open.s3.iter_bucket(
    bucket_name=config.DEFAULT_S3_BUCKET_NAME,
    prefix=openalex_works_target_prefix,
    accept_key=None,
    key_limit=16,
    workers=16,
    retries=3,
):
    print(key, round(len(content) / 2**10))
"""

file_counter = 0
file_limit = 2
line_limit = 10
total_record_counter = 0
total_record_limit = 15
files = s3_client.list_objects(Bucket=config.DEFAULT_S3_BUCKET_NAME, Prefix='', Delimiter='/', )
print(files)
for file_ref in files['Contents']:
    if file_counter < line_limit:
        line_counter = 0
        source_filepath = file_ref['Key']
        with smart_open.open(bucket_id=config.DEFAULT_S3_BUCKET_NAME, key_id=source_filepath, mode='rb', client=s3_client) as fl:
            for line in fl:
                if line_counter >= line_limit or total_record_counter >= total_record_limit:
                    break
                print(f'{total_record_counter:06}:{line_counter:06}|{line}')
                line_counter += 1
                total_record_counter += 1
        file_counter += 1


{'ResponseMetadata': {'RequestId': 'XY4983GACFX32MNW', 'HostId': 'es4c7LUDyUTmylKbbR8glcWexJ0UJ/Q5ybSz8Iw3o24lWkWmExfQQtqlWiovjQjYoDT4i1IEKgc=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'es4c7LUDyUTmylKbbR8glcWexJ0UJ/Q5ybSz8Iw3o24lWkWmExfQQtqlWiovjQjYoDT4i1IEKgc=', 'x-amz-request-id': 'XY4983GACFX32MNW', 'date': 'Wed, 27 Aug 2025 20:33:30 GMT', 'x-amz-bucket-region': 'eu-west-2', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'IsTruncated': False, 'Marker': '', 'Name': 'sagemaker-research-methodology-extraction', 'Prefix': '', 'Delimiter': '/', 'MaxKeys': 1000, 'CommonPrefixes': [{'Prefix': '00_test/'}, {'Prefix': '01_data/'}, {'Prefix': '01_raw/'}], 'EncodingType': 'url'}
