In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Make the project's shared modules importable by adding ../01_modules
# (the "dev" location below) and ./01_modules (the "prod" location below)
# to sys.path, for whichever of the two directories exists.
if '__file__' in globals():
    # Executed as a script/module: resolve relative to this file's directory.
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    # __file__ is not defined (e.g. in a notebook kernel): use the
    # current working directory instead.
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
for _modules_path in (modules_path_in_dev, modules_path_in_prod):
    # Skip paths that are already registered so that re-running this cell
    # does not keep growing sys.path with duplicate entries.
    if os.path.exists(_modules_path) and _modules_path not in sys.path:
        sys.path.append(_modules_path)


# An imported module stays cached for the lifetime of the kernel, so
# re-running an import statement in Jupyter does not pick up edits made to
# the module's source file after it was first imported.
# Workaround: import the module object itself and pass it to
# importlib.reload, which re-executes the module's code. The return value
# (the reloaded module) is assigned to `_` because it is not needed.
# NOTE(review): on a fresh kernel this runs each module twice (import, then
# reload), which matches each "loaded" banner appearing twice in the output.
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [2]:
# One-off SageMaker processing job that runs the OpenAlex works reduction
# script. Disabled by default: per the original author's note the job takes
# about a day, so it must not be re-triggered by "Run All".
# Set the flag to True only when the reduction has to be regenerated.
RUN_OPENALEX_WORKS_REDUCTION = False

if RUN_OPENALEX_WORKS_REDUCTION:
    execution_role = get_execution_role()
    # Two directory levels above the current working directory; passed as
    # source_dir so the `code` path below can be given relative to it.
    source_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    print('source_dir:', source_dir)
    sklearn_processor = FrameworkProcessor(
        estimator_cls=SKLearn,
        framework_version='1.2-1',  # The newest supported version by sagemaker
        instance_type='ml.c7i.16xlarge',
        instance_count=1,
        # Job name is derived from the script's topic with underscores
        # replaced by hyphens.
        base_job_name='openalex_works_reduction'.replace('_', '-'),
        role=execution_role,
    )

    # wait=True blocks this cell until the processing job has finished.
    # NOTE(review): the processor is not created with a PipelineSession, so
    # `step_args` presumably does not hold pipeline step arguments - confirm.
    step_args = sklearn_processor.run(
        code='src/03_transformation/03_11_transformation_openalex_works_reduction.py',
        source_dir=source_dir,
        inputs=[],  # We are not using automatic input-output mapping, instead we handle everything in the script directly on S3
        outputs=[],
        arguments=[
            '--runtype', 'prod',
            '--file-max-limit', '10000',
        ],
        wait=True,
    )

In [3]:
# (Re)build the staging table "02_stg".base_openalex_works_reduced via the
# project helper and log how long it took.
# With overwrite_strategy='overwrite' an existing table is replaced: the
# captured output of this cell shows the Glue Catalog table and its S3
# objects being deleted first, so re-running this cell is destructive.
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name='02_stg',
    table_name='base_openalex_works_reduced',
    overwrite_strategy='overwrite',  # options: fail, overwrite, ignore
    wait=True,
)
# log() prints the message and also returns it; the trailing semicolon stops
# Jupyter from echoing the returned string a second time as the cell output.
timelogger.log('"02_stg".base_openalex_works_reduced created');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.base_openalex_works_reduced already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg base_openalex_works_reduced
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_openalex_works_reduced/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_openalex_works_reduced created | since_start: 1.0 minute, 42.04 seconds | since_last: 1.0 minute, 42.04 seconds :: 


' :: "02_stg".base_openalex_works_reduced created | since_start: 1.0 minute, 42.04 seconds | since_last: 1.0 minute, 42.04 seconds :: '

In [4]:
# Sanity-check the freshly built table: show a five-row sample, then the
# total row count, each as its own rendered DataFrame.
utils.pd_set_options()  # project helper; presumably adjusts pandas display options
openalex_preview_sql = """SELECT * FROM "02_stg".base_openalex_works_reduced LIMIT 5 """
openalex_count_sql = """SELECT COUNT(*) AS c FROM "02_stg".base_openalex_works_reduced """
for openalex_sql in (openalex_preview_sql, openalex_count_sql):
    # Each query runs on Athena against the '02_stg' database and is
    # displayed before the next one starts.
    display(wr.athena.read_sql_query(openalex_sql, '02_stg'))

Unnamed: 0,id_openalex,id_doi,title,language,primary_topic_id,primary_topic_display_name,primary_topic_subfield_id,primary_topic_subfield_display_name,primary_topic_field_id,primary_topic_field_display_name,primary_topic_domain_id,primary_topic_domain_display_name
0,269677805,,2. Profil type du détenu politique à Eysses,fr,T10153,"Education, sociology, and vocational training",3312,Sociology and Political Science,33,Social Sciences,2,Social Sciences
1,2696780303,,Analisa Struktur Dan Material Speed Bump Dengan Bahan Concrete Foam Untuk Penggerak Tenaga Listrik,id,T13674,Computer Science and Engineering,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences
2,2696784097,,Strategies of survival during the holocaust,en,T11203,Jewish and Middle Eastern Studies,3312,Sociology and Political Science,33,Social Sciences,2,Social Sciences
3,2696788076,,Business intelligence and Marketing analytics/Inteligencia de negocio y análisis de datos,,T11891,Big Data and Business Intelligence,1404,Management Information Systems,14,"Business, Management and Accounting",2,Social Sciences
4,2696777682,,El papel transversal de la lectura en el currículo,es,T13061,Literacy and Educational Practices,3304,Education,33,Social Sciences,2,Social Sciences


Unnamed: 0,c
0,270051911


In [7]:
# (Re)build the staging table "02_stg".base_semanticscholar_s2orcv2 via the
# project helper and log how long it took.
# With overwrite_strategy='overwrite' an existing table is replaced: the
# captured output of this cell shows the Glue Catalog table and its S3
# objects being deleted first, so re-running this cell is destructive.
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name='02_stg',
    table_name='base_semanticscholar_s2orcv2',
    overwrite_strategy='overwrite',  # options: fail, overwrite, ignore
    wait=True,
)
# log() prints the message and also returns it; the trailing semicolon stops
# Jupyter from echoing the returned string a second time as the cell output.
timelogger.log('"02_stg".base_semanticscholar_s2orcv2 created');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.base_semanticscholar_s2orcv2 already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg base_semanticscholar_s2orcv2
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_semanticscholar_s2orcv2/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_semanticscholar_s2orcv2 created | since_start: 10.0 minutes, 44.14 seconds | since_last: 10.0 minutes, 44.14 seconds :: 


' :: "02_stg".base_semanticscholar_s2orcv2 created | since_start: 10.0 minutes, 44.14 seconds | since_last: 10.0 minutes, 44.14 seconds :: '

In [8]:
# Sanity-check the freshly built table: show a five-row sample, then the
# total row count, each as its own rendered DataFrame.
utils.pd_set_options(cols=500)  # project helper; presumably adjusts pandas display options
s2orc_preview_sql = """SELECT * FROM "02_stg".base_semanticscholar_s2orcv2 LIMIT 5 """
s2orc_count_sql = """SELECT COUNT(*) AS c FROM "02_stg".base_semanticscholar_s2orcv2 """
for s2orc_sql in (s2orc_preview_sql, s2orc_count_sql):
    # Each query runs on Athena against the '02_stg' database and is
    # displayed before the next one starts.
    display(wr.athena.read_sql_query(s2orc_sql, '02_stg'))

Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,title,license,source_url,openaccess_status,content_text,annotations_paragraph,annotations_section_header
0,268024248,,10.30994/jhsc.v4i3.214,,"Mother's Motivational Factors Regarding Basic Immunization Completeness for 3 Year Old Toddlers at the ""W"" Mimika Timika Papua Health Center",CCBYSA,https://doi.org/10.30994/jhsc.v4i3.214,HYBRID,"\nINTRODUCTION\n\nLow immunization coverage is an indicator of deaths due to VPD. Therefore, one program that has been proven effective in reducing morbidity and mortality due to VPD is immunization. This is in line with the MDG agreement, where achieving a reduction in infant mortality is characterized by increasing immunization coverage, especially seen from the measles immunization coverage rate (WHO, 2010). This is because measles is the last immunization for basic immunization and is an...","[{""attributes"":null,""end"":1154,""start"":15},{""attributes"":null,""end"":1501,""start"":1156},{""attributes"":null,""end"":1974,""start"":1503},{""attributes"":null,""end"":2511,""start"":1976},{""attributes"":null,""end"":2743,""start"":2513},{""attributes"":null,""end"":4202,""start"":2753},{""attributes"":null,""end"":5014,""start"":4204},{""attributes"":null,""end"":6142,""start"":5016},{""attributes"":null,""end"":7187,""start"":6144},{""attributes"":null,""end"":7305,""start"":7301},{""attributes"":null,""end"":8620,""start"":7331},{""attributes""...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":2751,""start"":2745},{""attributes"":null,""end"":7211,""start"":7189},{""attributes"":null,""end"":7273,""start"":7213},{""attributes"":null,""end"":7299,""start"":7275},{""attributes"":null,""end"":7317,""start"":7307},{""attributes"":null,""end"":7329,""start"":7319},{""attributes"":null,""end"":8644,""start"":8622},{""attributes"":null,""end"":9413,""start"":9345},{""attributes"":null,""end"":11541,""start"":11531}]"
1,259924751,,10.1109/ISBI53787.2023.10230709,2307.07177,TriFormer: A Multi-modal Transformer Framework For Mild Cognitive Impairment Conversion Prediction,,https://arxiv.org/abs/2307.07177,GREEN,\nINTRODUCTION\n\nMild cognitive impairment (MCI) patients exhibit a memory impairment earlier than the expected age. It is a transitional stage from Congnitively normal (CN) to Alzheimer's disease (AD) where around 10% to 15% MCI patients end up progressing to AD every year [1]. Patients having MCI can either progress to AD within several years defined as progressive MCI (pMCI) or stay at the same MCI stage defined as stable MCI (sMCI). Previous studies have shown that early nonpharmacologi...,"[{""attributes"":null,""end"":964,""start"":15},{""attributes"":null,""end"":2895,""start"":966},{""attributes"":null,""end"":2954,""start"":2897},{""attributes"":null,""end"":3357,""start"":2965},{""attributes"":null,""end"":4812,""start"":3384},{""attributes"":null,""end"":4995,""start"":4814},{""attributes"":null,""end"":5050,""start"":4997},{""attributes"":null,""end"":5151,""start"":5052},{""attributes"":null,""end"":5255,""start"":5153},{""attributes"":null,""end"":5870,""start"":5285},{""attributes"":null,""end"":6311,""start"":5901},{""attributes"":n...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":2963,""start"":2956},{""attributes"":{""n"":""2.1.""},""end"":3382,""start"":3359},{""attributes"":{""n"":""2.2.""},""end"":5283,""start"":5257},{""attributes"":{""n"":""2.3.""},""end"":5899,""start"":5872},{""attributes"":{""n"":""3.""},""end"":6335,""start"":6313},{""attributes"":{""n"":""3.1.""},""end"":6344,""start"":6337},{""attributes"":{""n"":""3.2.""},""end"":7896,""start"":7882},{""attributes"":{""n"":""3.3.""},""end"":9703,""start"":9662},{""attributes"":{""n"":""4.""},""end"":9715,""s..."
2,225569631,3044367686.0,10.3390/met10070979,,Residual Stress Analysis of a 2219 Aluminum Alloy Ring Using the Indentation Strain-Gauge Method,CCBY,https://doi.org/10.3390/met10070979,GOLD,"\nIntroduction\n\nMonolithic thin-walled parts are widely used in the aviation and aerospace production due to their high specific stiffness and strength. Their thin-walled parts are manufactured by turning and milling, in which case, more than 90% of the material is removed from the blank [1]. However, clear distortion also occurs to the workpiece in the machining process, which causes manufacturing errors and assembly difficulties. \n\nAccording to a large number of simulation and experime...","[{""attributes"":null,""end"":435,""start"":15},{""attributes"":null,""end"":1192,""start"":437},{""attributes"":null,""end"":2828,""start"":1194},{""attributes"":null,""end"":4055,""start"":2830},{""attributes"":null,""end"":5041,""start"":4057},{""attributes"":null,""end"":6300,""start"":5043},{""attributes"":null,""end"":6948,""start"":6366},{""attributes"":null,""end"":8402,""start"":6972},{""attributes"":null,""end"":9369,""start"":8404},{""attributes"":null,""end"":9519,""start"":9371},{""attributes"":null,""end"":9553,""start"":9521},{""attributes"":n...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":6349,""start"":6302},{""attributes"":{""n"":""2.1.""},""end"":6364,""start"":6351},{""attributes"":{""n"":""2.2.""},""end"":6970,""start"":6950},{""attributes"":{""n"":""2.3.""},""end"":9603,""start"":9555},{""attributes"":null,""end"":11020,""start"":11019},{""attributes"":null,""end"":12502,""start"":12501},{""attributes"":{""n"":""3.""},""end"":15243,""start"":15175},{""attributes"":{""n"":""3.""},""end"":15966,""start"":15898},{""attributes"":{""n"":""3.""},""end"":16689,""start"":166..."
3,226421097,3036002560.0,10.1051/e3sconf/202017401063,,Adjustment of the Exploration Grids and its use to increase the Reliability of Geological Models of Coal Deposits,CCBY,https://doi.org/10.1051/e3sconf/202017401063,GOLD,"\nIntroduction\n\nThe estimate of mineral resources and reserves is an essential task for mining engineers. The geological models of deposits built during its implementation used as a basis for detailed mine designing and planning [1]. Therefore, the quality of models directly affects the quality of mining projects. \n\nThe role of geological modelling is significantly increasing in connection with the development of the mining industry digitalization. Without reliable geological models, it ...","[{""attributes"":null,""end"":315,""start"":15},{""attributes"":null,""end"":925,""start"":317},{""attributes"":null,""end"":1751,""start"":927},{""attributes"":null,""end"":2199,""start"":1753},{""attributes"":null,""end"":2559,""start"":2201},{""attributes"":null,""end"":3074,""start"":2582},{""attributes"":null,""end"":3296,""start"":3076},{""attributes"":null,""end"":3738,""start"":3298},{""attributes"":null,""end"":4463,""start"":3740},{""attributes"":null,""end"":5216,""start"":4465},{""attributes"":null,""end"":5481,""start"":5218},{""attributes"":nul...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{""attributes"":{""n"":""2""},""end"":2580,""start"":2561},{""attributes"":{""n"":""3""},""end"":8230,""start"":8208},{""attributes"":{""n"":""4""},""end"":11660,""start"":11650}]"
4,221446156,,,2009.0089,Skyrmion-antiskyrmion droplets in a chiral ferromagnet,,https://arxiv.org/abs/2009.00890,,"\nI. INTRODUCTION\n\nSince the experimental observation of skyrmions in ferromagnetic materials with the Dzyaloshinskii-Moriya (DM) interaction, a substantial amount of work has been devoted to their statics and dynamics [1]. Chiral skyrmions are topological solitons that have the same topological features as magnetic bubbles [2], but the detailed features of the chiral skyrmion profile are specific to it [3,4]. Most work has largely focused on the axially symmetric chiral skyrmion predicted...","[{""attributes"":null,""end"":837,""start"":18},{""attributes"":null,""end"":1478,""start"":839},{""attributes"":null,""end"":2147,""start"":1480},{""attributes"":null,""end"":2624,""start"":2149},{""attributes"":null,""end"":3031,""start"":2626},{""attributes"":null,""end"":3327,""start"":3050},{""attributes"":null,""end"":3819,""start"":3329},{""attributes"":null,""end"":4015,""start"":3821},{""attributes"":null,""end"":4195,""start"":4017},{""attributes"":null,""end"":4265,""start"":4197},{""attributes"":null,""end"":4287,""start"":4267},{""attributes"":n...","[{""attributes"":null,""end"":16,""start"":1},{""attributes"":null,""end"":3048,""start"":3033},{""attributes"":null,""end"":5903,""start"":5868},{""attributes"":null,""end"":10945,""start"":10908},{""attributes"":null,""end"":19404,""start"":19374},{""attributes"":null,""end"":21204,""start"":21182}]"


Unnamed: 0,c
0,11609787
