In [2]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Make the shared project modules importable by adding ../01_modules (the "dev"
# location) and/or ./01_modules (the "prod" location) to sys.path, whichever exist.
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    # __file__ is not defined here (e.g. inside a notebook kernel), so fall back
    # to the current working directory.
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
for _modules_path in (modules_path_in_dev, modules_path_in_prod):
    # Skip paths that are already registered: this cell may be re-run many times
    # in one kernel session and must not pile up duplicate sys.path entries.
    if os.path.exists(_modules_path) and _modules_path not in sys.path:
        sys.path.append(_modules_path)


# Jupyter only executes a local module the first time it is imported after
# kernel start. Re-running a cell that imports it again would not change
# anything, even if the module's source file has since been edited.
# As a workaround, import the module and then call importlib.reload on it so
# that the module code is executed again.
# The return value of importlib.reload is bound to "_" - presumably to keep the
# module object from being echoed as cell output.
# NOTE(review): utils is reloaded before config; if utils copies names out of
# config at import time it may keep the old values - confirm the order is intended.
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [7]:
# Launch the OpenAlex works reduction script as a SageMaker processing job.
execution_role = get_execution_role()

# source_dir is the directory two levels above the current working directory;
# it is handed to the processor as the source bundle for the job.
source_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
print('source_dir:', source_dir)

# Job names are built from this value with underscores replaced by hyphens.
processing_job_prefix = 'openalex_works_reduction'.replace('_', '-')

processor_settings = dict(
    estimator_cls=SKLearn,
    framework_version='1.2-1',  # The newest supported version by sagemaker
    instance_type='ml.c7i.16xlarge',
    instance_count=1,
    base_job_name=processing_job_prefix,
    role=execution_role,
)
sklearn_processor = FrameworkProcessor(**processor_settings)

# Command-line arguments forwarded to the transformation script.
script_arguments = [
    '--runtype', 'prod',
    '--file-max-limit', '10000',
]

# We are not using automatic input-output mapping; instead the script handles
# everything directly on S3, hence the empty inputs/outputs lists.
# wait=True blocks the cell until the processing job has finished.
step_args = sklearn_processor.run(
    code='src/03_transformation/03_11_transformation_openalex_works_reduction.py',
    source_dir=source_dir,
    inputs=[],
    outputs=[],
    arguments=script_arguments,
    wait=True,
)

source_dir: /home/sagemaker-user/research_methodology_extraction


INFO:sagemaker.processing:Uploaded /home/sagemaker-user/research_methodology_extraction to s3://sagemaker-eu-west-2-762595428873/openalex-works-reduction-2025-08-28-17-31-02-718/source/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-eu-west-2-762595428873/openalex-works-reduction-2025-08-28-17-31-02-718/source/runproc.sh
INFO:sagemaker:Creating processing-job with name openalex-works-reduction-2025-08-28-17-31-02-718


.............[34mCodeArtifact repository not specified. Skipping login.[0m
[34mFound existing installation: typing 3.7.4.3[0m
[34mUninstalling typing-3.7.4.3:
  Successfully uninstalled typing-3.7.4.3[0m
[34mCollecting smart-open==7.3.0 (from smart-open[s3]==7.3.0->-r requirements.txt (line 1))
  Downloading smart_open-7.3.0-py3-none-any.whl.metadata (24 kB)[0m
[34mCollecting awswrangler==3.12.1 (from -r requirements.txt (line 2))
  Downloading awswrangler-3.12.1-py3-none-any.whl.metadata (17 kB)[0m
[34mCollecting wrapt (from smart-open==7.3.0->smart-open[s3]==7.3.0->-r requirements.txt (line 1))
  Downloading wrapt-1.17.3-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl.metadata (6.4 kB)[0m
[34mCollecting numpy<3.0,>=1.26 (from awswrangler==3.12.1->-r requirements.txt (line 2))
  Downloading numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)[0m
[34mCollecting pandas<3.0.0,>=1.2.0 (from awswrangler==3.12.1->-r 

In [6]:
# (Re)create the staging table "02_stg".base_openalex_works_reduced via the
# project's create_table_from_sql_file helper and log how long it took.
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name='02_stg',
    table_name='base_openalex_works_reduced',
    overwrite_strategy='overwrite',  # options: fail, overwrite, ignore
    wait=True,
)
# timelogger.log() prints the message and also returns it; binding the return
# value to "_" keeps the same text from being echoed a second time as cell output.
_ = timelogger.log('"02_stg".base_openalex_works_reduced created')
# TODO: re-run after full manual transformation finished:
# https://eu-west-2.console.aws.amazon.com/cloudwatch/home?region=eu-west-2#logsV2:log-groups/log-group/$252Faws$252Fsagemaker$252FProcessingJobs/log-events/openalex-works-reduction-2025-08-28-17-31-02-718$252Falgo-1-1756402311

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.base_openalex_works_reduced already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg base_openalex_works_reduced
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_openalex_works_reduced/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_openalex_works_reduced created | since_start: 1.0 minute, 8.45 seconds | since_last: 1.0 minute, 8.45 seconds :: 


' :: "02_stg".base_openalex_works_reduced created | since_start: 1.0 minute, 8.45 seconds | since_last: 1.0 minute, 8.45 seconds :: '

In [7]:
# Sanity-check the freshly created table: show a small sample of rows and the
# total row count, both queried through Athena against the '02_stg' database.
utils.pd_set_options()
preview_sql = """SELECT * FROM "02_stg".base_openalex_works_reduced LIMIT 5 """
row_count_sql = """SELECT COUNT(*) AS c FROM "02_stg".base_openalex_works_reduced """
for sanity_sql in (preview_sql, row_count_sql):
    display(wr.athena.read_sql_query(sanity_sql, '02_stg'))

Unnamed: 0,id_openalex,id_doi,title,language,primary_topic_id,primary_topic_display_name,primary_topic_subfield_id,primary_topic_subfield_display_name,primary_topic_field_id,primary_topic_field_display_name,primary_topic_domain_id,primary_topic_domain_display_name
0,3165205644,,Research Sources & Guides: Healthcare Industry: Company Info,en,T11792,Pharmaceutical Economics and Policy,2002,Economics and Econometrics,20,"Economics, Econometrics and Finance",2,Social Sciences
1,3165205739,,Research Guides: Marketing Research: Home,en,T11536,Consumer Retail Behavior Studies,1406,Marketing,14,"Business, Management and Accounting",2,Social Sciences
2,3165204939,,Dimensiones y categorías de la eficacia docente universitaria: una propuesta de indicadores,es,T14483,Education and Teacher Training,3304,Education,33,Social Sciences,2,Social Sciences
3,3165205348,,Guides: Sports Business Management: Dissertations and theses,en,T11474,Sport and Mega-Event Impacts,3312,Sociology and Political Science,33,Social Sciences,2,Social Sciences
4,3165203657,,LibGuides: Sähkötekniikka: Aineistot kirjastossa,fi,T12918,Healthcare Systems and Technology,1407,Organizational Behavior and Human Resource Management,14,"Business, Management and Accounting",2,Social Sciences


Unnamed: 0,c
0,147565806
