In [2]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Make the project's local modules importable by adding ../01_modules and/or
# ./01_modules (whichever exists) to sys.path. When __file__ is defined the
# paths are resolved relative to the file's directory; otherwise (for example
# in an interactive session) the current working directory is used.
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
for _modules_path in (modules_path_in_dev, modules_path_in_prod):
    # The membership check keeps this cell idempotent: re-running it must not
    # append the same directory to sys.path a second time.
    if os.path.exists(_modules_path) and _modules_path not in sys.path:
        sys.path.append(_modules_path)


# Jupyter executes a local module only the first time it is imported after a
# kernel start. Re-running a cell with "import mymodulename" (or
# "from mymodulename import *") does not pick up later edits to the file.
# As a workaround, each local module is imported and then re-executed with
# importlib.reload.
# config is reloaded before utils so that, if utils imports config (presumably
# it does -- TODO confirm), the reloaded utils binds the fresh config values
# instead of the ones from before the reload.
import config
_ = importlib.reload(config)
import utils
_ = importlib.reload(utils)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [2]:
# One-off SageMaker processing job that runs the OpenAlex works reduction
# script. The job takes about a day, so it is disabled by default; set the flag
# to True only when the reduction really has to be recomputed.
RUN_OPENALEX_WORKS_REDUCTION_JOB = False

if RUN_OPENALEX_WORKS_REDUCTION_JOB:
    execution_role = get_execution_role()
    # The source directory is two levels above the current working directory,
    # so the result depends on where the notebook kernel was started.
    source_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    print('source_dir:', source_dir)
    sklearn_processor = FrameworkProcessor(
        estimator_cls=SKLearn,
        framework_version='1.2-1', # The newest supported version by sagemaker
        instance_type='ml.c7i.16xlarge',
        instance_count=1,
        # Underscores are swapped for hyphens to form the job name prefix
        # (presumably because SageMaker job names do not allow underscores).
        base_job_name='openalex_works_reduction'.replace('_', '-'),
        role=execution_role
    )

    # NOTE(review): no PipelineSession is passed, so run() presumably launches
    # the job immediately and blocks until it finishes (wait=True); in that
    # case step_args is not a pipeline step definition -- confirm before reuse.
    step_args = sklearn_processor.run(
        code='src/03_transformation/03_11_transformation_openalex_works_reduction.py',
        source_dir=source_dir,
        inputs=[], # We are not using automatic input-output mapping, instead we handle everything in the script directly on S3
        outputs=[],
        arguments=[
            '--runtype', 'prod',
            '--file-max-limit', '10000',
        ],
        wait=True
    )

In [3]:
# (Re)create the staging table "02_stg".base_openalex_works_reduced through the
# project helper and log how long it took. With the 'overwrite' strategy the
# helper's own log output reports dropping an existing Glue table and its S3
# objects before rebuilding.
timelogger = utils.TimeLogger()
create_table_kwargs = {
    'database_name': '02_stg',
    'table_name': 'base_openalex_works_reduced',
    'overwrite_strategy': 'overwrite',  # options: fail, overwrite, ignore
    'wait': True,
}
utils.create_table_from_sql_file(**create_table_kwargs)
timelogger.log('"02_stg".base_openalex_works_reduced created')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.base_openalex_works_reduced already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg base_openalex_works_reduced
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_openalex_works_reduced/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_openalex_works_reduced created | since_start: 1.0 minute, 42.04 seconds | since_last: 1.0 minute, 42.04 seconds :: 


' :: "02_stg".base_openalex_works_reduced created | since_start: 1.0 minute, 42.04 seconds | since_last: 1.0 minute, 42.04 seconds :: '

In [4]:
# Sanity check of the freshly built table: show a few sample rows, then the
# total row count. Each query is run and displayed before the next one starts.
utils.pd_set_options()
for preview_sql in (
    """SELECT * FROM "02_stg".base_openalex_works_reduced LIMIT 5 """,
    """SELECT COUNT(*) AS c FROM "02_stg".base_openalex_works_reduced """,
):
    display(wr.athena.read_sql_query(preview_sql, '02_stg'))

Unnamed: 0,id_openalex,id_doi,title,language,primary_topic_id,primary_topic_display_name,primary_topic_subfield_id,primary_topic_subfield_display_name,primary_topic_field_id,primary_topic_field_display_name,primary_topic_domain_id,primary_topic_domain_display_name
0,269677805,,2. Profil type du détenu politique à Eysses,fr,T10153,"Education, sociology, and vocational training",3312,Sociology and Political Science,33,Social Sciences,2,Social Sciences
1,2696780303,,Analisa Struktur Dan Material Speed Bump Dengan Bahan Concrete Foam Untuk Penggerak Tenaga Listrik,id,T13674,Computer Science and Engineering,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences
2,2696784097,,Strategies of survival during the holocaust,en,T11203,Jewish and Middle Eastern Studies,3312,Sociology and Political Science,33,Social Sciences,2,Social Sciences
3,2696788076,,Business intelligence and Marketing analytics/Inteligencia de negocio y análisis de datos,,T11891,Big Data and Business Intelligence,1404,Management Information Systems,14,"Business, Management and Accounting",2,Social Sciences
4,2696777682,,El papel transversal de la lectura en el currículo,es,T13061,Literacy and Educational Practices,3304,Education,33,Social Sciences,2,Social Sciences


Unnamed: 0,c
0,270051911


In [5]:
# (Re)create the staging table "02_stg".base_semanticscholar_s2orcv2 through
# the project helper and log how long it took. With the 'overwrite' strategy
# the helper's own log output reports dropping an existing Glue table and its
# S3 objects before rebuilding.
timelogger = utils.TimeLogger()
create_table_kwargs = {
    'database_name': '02_stg',
    'table_name': 'base_semanticscholar_s2orcv2',
    'overwrite_strategy': 'overwrite',  # options: fail, overwrite, ignore
    'wait': True,
}
utils.create_table_from_sql_file(**create_table_kwargs)
timelogger.log('"02_stg".base_semanticscholar_s2orcv2 created')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.base_semanticscholar_s2orcv2 already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg base_semanticscholar_s2orcv2
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_semanticscholar_s2orcv2/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_semanticscholar_s2orcv2 created | since_start: 11.0 minutes, 4.77 seconds | since_last: 11.0 minutes, 4.77 seconds :: 


' :: "02_stg".base_semanticscholar_s2orcv2 created | since_start: 11.0 minutes, 4.77 seconds | since_last: 11.0 minutes, 4.77 seconds :: '

In [6]:
# Sanity check of the freshly built table: show a few sample rows, then the
# total row count. Each query is run and displayed before the next one starts.
utils.pd_set_options(cols=500)
for preview_sql in (
    """SELECT * FROM "02_stg".base_semanticscholar_s2orcv2 LIMIT 3 """,
    """SELECT COUNT(*) AS c FROM "02_stg".base_semanticscholar_s2orcv2 """,
):
    display(wr.athena.read_sql_query(preview_sql, '02_stg'))

Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,title,source_url,openaccess_status,content_text,annotations_paragraph,annotations_section_header,license
0,85913114,2142459664.0,10.5897/AJB2013.12952,,Biochemical and cytological analysis of five cultivars of Cicer (chickpea),https://doi.org/10.5897/AJB2013.12952,GREEN,"\nINTRODUCTION\n\nThe genus Cicer include 33 perennial, eight annual, one unspecified wild species as well as the cultivated ones ( Van der Maesen, 1987). Chickpea is the second most important cool season pulse crop in the world and is grown in at least 33 countries including central and west Asia, South Europe, Ethiopia, North Africa, North and South America and Australia (Ladizinsky and Adler, 1976;Singh and Ocampo, 1997). It is native to South Europe and is the most important pulse crop o...","[{""attributes"":null,""end"":1267,""start"":15},{""attributes"":null,""end"":2151,""start"":1269},{""attributes"":null,""end"":2673,""start"":2153},{""attributes"":null,""end"":2963,""start"":2698},{""attributes"":null,""end"":3235,""start"":2985},{""attributes"":null,""end"":3322,""start"":3237},{""attributes"":null,""end"":3875,""start"":3344},{""attributes"":null,""end"":4521,""start"":3896},{""attributes"":null,""end"":4856,""start"":4558},{""attributes"":null,""end"":5177,""start"":4873},{""attributes"":null,""end"":5594,""start"":5208},{""attributes""...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":2696,""start"":2675},{""attributes"":null,""end"":2983,""start"":2965},{""attributes"":null,""end"":3342,""start"":3324},{""attributes"":null,""end"":3894,""start"":3877},{""attributes"":null,""end"":4556,""start"":4523},{""attributes"":null,""end"":4871,""start"":4858},{""attributes"":null,""end"":5186,""start"":5179},{""attributes"":null,""end"":5206,""start"":5188},{""attributes"":null,""end"":5613,""start"":5596},{""attributes"":null,""end"":7337,""start"":7320},{""attributes"":nu...",CCBY
1,268714736,,10.3389/fmicb.2024.1359263,,Genomic and phenotypic analyses reveal Paenibacillus polymyxa PJH16 is a potential biocontrol agent against cucumber fusarium wilt,https://pmc.ncbi.nlm.nih.gov/articles/PMC11000672,GOLD,"\nIntroduction\n\nCucumber is an important economic crop. China is the main producer of cucumber, and its planting area and scale have ranked first in the world for many years. One cucumber disease that poses a serious threat to yield and quality is cucumber fusarium wilt caused by the fungus Fusarium oxysporum f. sp. cucumerinum which belongs to Ascomycota (Gao et al., 2014). This fungus is a soil-borne pathogen that can infect plants at any stage of growth. It begins invading the wounds an...","[{""attributes"":null,""end"":822,""start"":15},{""attributes"":null,""end"":3347,""start"":824},{""attributes"":null,""end"":4114,""start"":3349},{""attributes"":null,""end"":4376,""start"":4116},{""attributes"":null,""end"":4578,""start"":4378},{""attributes"":null,""end"":4604,""start"":4580},{""attributes"":null,""end"":5287,""start"":4673},{""attributes"":null,""end"":6002,""start"":5289},{""attributes"":null,""end"":7390,""start"":6042},{""attributes"":null,""end"":7708,""start"":7446},{""attributes"":null,""end"":8282,""start"":7710},{""attributes"":n...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{""attributes"":{""n"":""2.1""},""end"":4671,""start"":4606},{""attributes"":{""n"":""2.2""},""end"":6040,""start"":6004},{""attributes"":{""n"":""2.3""},""end"":7444,""start"":7392},{""attributes"":{""n"":""2.4""},""end"":11415,""start"":11356},{""attributes"":{""n"":""2.5""},""end"":11951,""start"":11884},{""attributes"":{""n"":""2.6""},""end"":14741,""start"":14678},{""attributes"":{""n"":""2.7""},""end"":15795,""start"":15762},{""attributes"":{""n"":""2.8""},""end"":17658,""start"":17615},{""attributes"":{""n"":""2.9""},""end"":1...",CCBY
2,787028,2160387886.0,10.1159/000345413,,Laparoscopic Splenectomy in Colorectal Cancer Patients with Chemotherapy-Associated Thrombocytopenia due to Hypersplenism,https://pmc.ncbi.nlm.nih.gov/articles/PMC3531924,GOLD,"\nIntroduction\n\nMetastatic colorectal cancer (mCRC) affects approximately 50,000 people a year [1]. The cornerstone of treatment for these patients is systemic chemotherapy, especially with oxaliplatin-based regimens. Acquired thrombocytopenia is a condition that complicates treatment with many chemotherapy regimens. Of the known mechanisms of chemotherapy-induced thrombocytopenia, bone marrow suppression is the most common. Oxaliplatin is also associated with two other etiologies of throm...","[{""attributes"":null,""end"":1177,""start"":15},{""attributes"":null,""end"":2286,""start"":1179},{""attributes"":null,""end"":2856,""start"":2297},{""attributes"":null,""end"":3528,""start"":2858},{""attributes"":null,""end"":4223,""start"":3530},{""attributes"":null,""end"":4599,""start"":4225},{""attributes"":null,""end"":4998,""start"":4610},{""attributes"":null,""end"":5480,""start"":5000},{""attributes"":null,""end"":6490,""start"":5482},{""attributes"":null,""end"":7621,""start"":6504},{""attributes"":null,""end"":8226,""start"":7623},{""attributes""...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":2295,""start"":2288},{""attributes"":null,""end"":4608,""start"":4601},{""attributes"":null,""end"":6502,""start"":6492},{""attributes"":null,""end"":10853,""start"":10833}]",CCBYNC


Unnamed: 0,c
0,11609787


In [3]:
# (Re)create the staging table "02_stg".base_arxiv_metadata through the project
# helper and log how long it took. The 'overwrite' strategy is requested, so an
# existing table is replaced rather than causing a failure.
timelogger = utils.TimeLogger()
create_table_kwargs = {
    'database_name': '02_stg',
    'table_name': 'base_arxiv_metadata',
    'overwrite_strategy': 'overwrite',  # options: fail, overwrite, ignore
    'wait': True,
}
utils.create_table_from_sql_file(**create_table_kwargs)
timelogger.log('"02_stg".base_arxiv_metadata created')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_arxiv_metadata/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_arxiv_metadata created | since_start: 13.36 seconds | since_last: 13.36 seconds :: 


' :: "02_stg".base_arxiv_metadata created | since_start: 13.36 seconds | since_last: 13.36 seconds :: '

In [4]:
# Sanity check of the freshly built table: show a few sample rows, then the
# total row count. Each query is run and displayed before the next one starts.
utils.pd_set_options(cols=500)
for preview_sql in (
    """SELECT * FROM "02_stg".base_arxiv_metadata LIMIT 5 """,
    """SELECT COUNT(*) AS c FROM "02_stg".base_arxiv_metadata """,
):
    display(wr.athena.read_sql_query(preview_sql, '02_stg'))

Unnamed: 0,arxiv_id,doi_id,title,abstract,license
0,2502.12199,,Discrete isoperimetric inequalities on the strong products of paths,"For a graph $G=(V,\ E)$ and a nonempty set $S\subseteq V$, the \emph{vertex boundary} of $S$, denoted by $\partial_G(S)$, is defined to be the set of vertices that are not in $S$ but are adjacent to some vertex in $S$. In this paper, we focus on the strong products of paths, and study when the size of the vertex boundary of a set of $k$ vertices is minimized. We give a conjecture regarding the $n$-dimensional strong product of infinite paths, and prove it for the $2$-dimensional case. Also, ...",ArXiv nonexclusive-distrib
1,2502.122,,Efficient and Effective Prompt Tuning via Prompt Decomposition and Compressed Outer Product,"Prompt tuning (PT) offers a cost-effective alternative to fine-tuning large-scale pre-trained language models (PLMs), requiring only a few parameters in soft prompt tokens added before the input text. However, existing PT approaches face two significant issues: (i) They overlook intrinsic semantic associations between soft prompt tokens, leading to high discreteness and limited interactions, thus reducing the model's comprehension and effectiveness in complex tasks. (ii) Due to the complexit...",CCBY
2,2502.12197,,A Closer Look at System Prompt Robustness,"System prompts have emerged as a critical control surface for specifying the behavior of LLMs in chat and agent settings. Developers depend on system prompts to specify important context, output format, personalities, guardrails, content policies, and safety countermeasures, all of which require models to robustly adhere to the system prompt, especially when facing conflicting or adversarial user inputs. In practice, models often forget to consider relevant guardrails or fail to resolve conf...",CCBYSA
3,2502.12198,,Maximize Your Diffusion: A Study into Reward Maximization and Alignment for Diffusion-based Control,"Diffusion-based planning, learning, and control methods present a promising branch of powerful and expressive decision-making solutions. Given the growing interest, such methods have undergone numerous refinements over the past years. However, despite these advancements, existing methods are limited in their investigations regarding general methods for reward maximization within the decision-making process. In this work, we study extensions of fine-tuning approaches for control applications....",CCBY
4,2502.12195,,GeneralizeFormer: Layer-Adaptive Model Generation across Test-Time Distribution Shifts,"We consider the problem of test-time domain generalization, where a model is trained on several source domains and adjusted on target domains never seen during training. Different from the common methods that fine-tune the model or adjust the classifier parameters online, we propose to generate multiple layer parameters on the fly during inference by a lightweight meta-learned transformer, which we call \textit{GeneralizeFormer}. The layer-wise parameters are generated per target batch witho...",CCBY


Unnamed: 0,c
0,2816721
