In [2]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Make the shared modules importable by adding ../01_modules (development
# layout) and/or ./01_modules (production layout) to the system path.
# When this code runs as a script, __file__ is defined and the paths are
# resolved relative to the script; inside a notebook kernel __file__ is
# missing, so the current working directory is used instead.
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
# Only append directories that exist and are not on sys.path yet, so that
# re-running this cell does not pile up duplicate entries.
for _modules_path in (modules_path_in_dev, modules_path_in_prod):
    if os.path.exists(_modules_path) and _modules_path not in sys.path:
        sys.path.append(_modules_path)


# Jupyter only reads a local module the first time it is imported after
# kernel start. Re-running a cell that imports the module again would not
# change anything, even if the module's source has changed in the meantime.
# As a workaround, we import each module directly and then call
# importlib.reload on it so that the current version of the file is used.
# The modules are accessed through their namespace afterwards (e.g. utils.<name>).
# The reload result is bound to `_` so that the module object is not
# echoed as the cell's output.
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [2]:
# One-off SageMaker processing job that runs the
# 03_11_transformation_openalex_works_reduction.py script.
# The job is deliberately disabled through the flag below:
# do not run this again, takes a day. Set the flag to True only when the
# job really has to be repeated.
RUN_OPENALEX_WORKS_REDUCTION = False
if RUN_OPENALEX_WORKS_REDUCTION:
    execution_role = get_execution_role()
    # The source bundle is the directory two levels above the current working directory
    source_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    print('source_dir:', source_dir)
    sklearn_processor = FrameworkProcessor(
        estimator_cls=SKLearn,
        framework_version='1.2-1', # The newest supported version by sagemaker
        instance_type='ml.c7i.16xlarge',
        instance_count=1,
        # Underscores are replaced by hyphens for the job name
        base_job_name='openalex_works_reduction'.replace('_','-'),
        role=execution_role
    )
    
    step_args = sklearn_processor.run(
        code='src/03_transformation/03_11_transformation_openalex_works_reduction.py',
        source_dir=source_dir,
        inputs=[], # We are not using automatic input-output mapping, instead we handle everything in the script directly on S3
        outputs=[],
        arguments=[
            '--runtype', 'prod',
            '--file-max-limit', '10000',
        ],
        wait=True
    )

In [3]:
# (Re)create the table via utils.create_table_from_sql_file and log how long
# it took. With overwrite_strategy='overwrite' the recorded run shows the
# existing Glue table and its S3 objects being deleted before the rebuild.
database_name = '02_stg'
table_name = 'base_openalex_works_reduced'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
# log() prints the message and also returns it; the trailing ';' stops
# Jupyter from echoing the returned string a second time as cell output.
timelogger.log(f'"{database_name}".{table_name} created');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.base_openalex_works_reduced already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg base_openalex_works_reduced
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_openalex_works_reduced/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_openalex_works_reduced created | since_start: 1.0 minute, 42.04 seconds | since_last: 1.0 minute, 42.04 seconds :: 


' :: "02_stg".base_openalex_works_reduced created | since_start: 1.0 minute, 42.04 seconds | since_last: 1.0 minute, 42.04 seconds :: '

In [4]:
# Preview "02_stg".base_openalex_works_reduced: first a handful of rows,
# then the total row count. Both queries run through Athena.
utils.pd_set_options()

openalex_preview = wr.athena.read_sql_query(
    'SELECT * FROM "02_stg".base_openalex_works_reduced LIMIT 5 ',
    '02_stg',
)
display(openalex_preview)

openalex_row_count = wr.athena.read_sql_query(
    'SELECT COUNT(*) AS c FROM "02_stg".base_openalex_works_reduced ',
    '02_stg',
)
display(openalex_row_count)

Unnamed: 0,id_openalex,id_doi,title,language,primary_topic_id,primary_topic_display_name,primary_topic_subfield_id,primary_topic_subfield_display_name,primary_topic_field_id,primary_topic_field_display_name,primary_topic_domain_id,primary_topic_domain_display_name
0,269677805,,2. Profil type du détenu politique à Eysses,fr,T10153,"Education, sociology, and vocational training",3312,Sociology and Political Science,33,Social Sciences,2,Social Sciences
1,2696780303,,Analisa Struktur Dan Material Speed Bump Dengan Bahan Concrete Foam Untuk Penggerak Tenaga Listrik,id,T13674,Computer Science and Engineering,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences
2,2696784097,,Strategies of survival during the holocaust,en,T11203,Jewish and Middle Eastern Studies,3312,Sociology and Political Science,33,Social Sciences,2,Social Sciences
3,2696788076,,Business intelligence and Marketing analytics/Inteligencia de negocio y análisis de datos,,T11891,Big Data and Business Intelligence,1404,Management Information Systems,14,"Business, Management and Accounting",2,Social Sciences
4,2696777682,,El papel transversal de la lectura en el currículo,es,T13061,Literacy and Educational Practices,3304,Education,33,Social Sciences,2,Social Sciences


Unnamed: 0,c
0,270051911


In [3]:
# (Re)create the table named in `table_name`, then preview three rows and
# count all rows, logging the elapsed time after each stage.
database_name = '02_stg'
table_name = 'base_semanticscholar_s2orcv2'
# Quoted database + table, as used in the SQL statements and log messages below
qualified_table_name = f'"{database_name}".{table_name}'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'{qualified_table_name} created')

utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM {qualified_table_name} LIMIT 3 """, database_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM {qualified_table_name} """, database_name))
# log() prints the message and also returns it; the trailing ';' stops
# Jupyter from echoing the returned string a second time as cell output.
timelogger.log(f'{qualified_table_name} queries finished');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.base_semanticscholar_s2orcv2 already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg base_semanticscholar_s2orcv2
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_semanticscholar_s2orcv2/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_semanticscholar_s2orcv2 created | since_start: 10.0 minutes, 56.91 seconds | since_last: 10.0 minutes, 56.91 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,title,source_url,openaccess_status,content_text,annotations_paragraph,annotations_section_header,license
0,268177633,,10.58258/jisip.v8i1.6259,,"Optimizing Land Acquisition Services Based on Involvement and Collaboration Between Group Governments and Communities in Telaga Bertong Subdistrict, West Sumbawa Regency",https://doi.org/10.58258/jisip.v8i1.6259,GOLD,"\nINTRODUCTION\n\nThe community involvement process in this case is divided into two stages, namely the preparation stage and the implementation stage. In the preparation stage, community participation includes the presence of the community in public consultation activities carried out by the government. In public consultation activities, the aims and objectives of public development are conveyed. Indicators of community participation are divided into 2 (dula), namely through knowledge and u...","[{""attributes"":null,""end"":883,""start"":15},{""attributes"":null,""end"":1533,""start"":885},{""attributes"":null,""end"":1786,""start"":1535},{""attributes"":null,""end"":2157,""start"":1788},{""attributes"":null,""end"":2796,""start"":2176},{""attributes"":null,""end"":3326,""start"":2798},{""attributes"":null,""end"":4386,""start"":3361},{""attributes"":null,""end"":4606,""start"":4468},{""attributes"":null,""end"":5499,""start"":4608},{""attributes"":null,""end"":5649,""start"":5585},{""attributes"":null,""end"":6159,""start"":5651},{""attributes"":n...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":2174,""start"":2159},{""attributes"":{""n"":""3.""},""end"":3359,""start"":3328},{""attributes"":{""n"":""2.""},""end"":4466,""start"":4388},{""attributes"":{""n"":""3.""},""end"":5583,""start"":5501},{""attributes"":{""n"":""2.""},""end"":8199,""start"":8093},{""attributes"":{""n"":""3.""},""end"":9093,""start"":9013},{""attributes"":null,""end"":9169,""start"":9153},{""attributes"":{""n"":""4.""},""end"":13711,""start"":13650},{""attributes"":{""n"":""2.""},""end"":14094,""start"":14036}]",CCBYSA
1,399918,2964337647.0,,1401.7437,Conceptual Framework for Internet of Things' Virtualization via OpenFlow in Context-aware Networks,https://arxiv.org/abs/1401.7437,,"\nIntroduction\n\nThe Internet of Things (IoT) can be outlined in a universal network frame supported by regular and interoperable network protocols in which sensible and virtual ""things"" are incorporated into the co mmunicat ion network. 'Th ings', by definition, resembles to any physical object that is capable to interconnect with each other and participate to develop the concept of e-services out of context information received fro m Internet of Things [1]; The concept of IoT enormously s...","[{""attributes"":null,""end"":946,""start"":15},{""attributes"":null,""end"":2786,""start"":948},{""attributes"":null,""end"":3042,""start"":2788},{""attributes"":null,""end"":5759,""start"":3044},{""attributes"":null,""end"":6078,""start"":5761},{""attributes"":null,""end"":6853,""start"":6092},{""attributes"":null,""end"":7741,""start"":6855},{""attributes"":null,""end"":8418,""start"":7743},{""attributes"":null,""end"":9524,""start"":8420},{""attributes"":null,""end"":10285,""start"":9526},{""attributes"":null,""end"":10799,""start"":10287},{""attributes...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":6090,""start"":6080},{""attributes"":{""n"":""3.""},""end"":11664,""start"":11644},{""attributes"":{""n"":""3.1.""},""end"":13193,""start"":13175},{""attributes"":{""n"":""3.2.""},""end"":14382,""start"":14370},{""attributes"":null,""end"":15640,""start"":15623},{""attributes"":{""n"":""3.3.""},""end"":15659,""start"":15642},{""attributes"":{""n"":""3.4.""},""end"":17068,""start"":17055},{""attributes"":{""n"":""4.""},""end"":18985,""start"":18957},{""attributes"":{""n"":""5.""},""end"":196...",
2,273910017,,10.1051/e3sconf/202458506003,,Recognizing Attitudes and Expectations about the Role of Advanced Nurses (ANP’s) in Kosovo’s Healthcare System,https://doi.org/10.1051/e3sconf/202458506003,GOLD,"\nBackground\n\nThe role of Advanced Nurse Practitioners (ANPs) has gained increasing recognition globally due to the expanding scope of healthcare and the need for more specialized and efficient healthcare delivery. \n\nANPs are registered nurses with advanced clinical education, skills, and expertise, often holding master's or doctoral degrees. They are trained to provide a wide range of healthcare services, including diagnosis and treatment of medical conditions, prescribing medications, ...","[{""attributes"":null,""end"":214,""start"":13},{""attributes"":null,""end"":722,""start"":216},{""attributes"":null,""end"":1157,""start"":724},{""attributes"":null,""end"":1540,""start"":1159},{""attributes"":null,""end"":2030,""start"":1542},{""attributes"":null,""end"":2560,""start"":2080},{""attributes"":null,""end"":2943,""start"":2562},{""attributes"":null,""end"":3444,""start"":2987},{""attributes"":null,""end"":4129,""start"":3446},{""attributes"":null,""end"":4680,""start"":4131},{""attributes"":null,""end"":5123,""start"":4682},{""attributes"":nul...","[{""attributes"":{""n"":""1""},""end"":11,""start"":1},{""attributes"":{""n"":""1.1""},""end"":2078,""start"":2032},{""attributes"":{""n"":""1.2""},""end"":2985,""start"":2945},{""attributes"":{""n"":""3""},""end"":5677,""start"":5666},{""attributes"":null,""end"":7166,""start"":7163},{""attributes"":null,""end"":12060,""start"":12046},{""attributes"":null,""end"":12084,""start"":12062},{""attributes"":{""n"":""5""},""end"":14942,""start"":14919}]",CCBY


Unnamed: 0,c
0,11609787


 :: "02_stg".base_semanticscholar_s2orcv2 queries finished | since_start: 11.0 minutes, 7.66 seconds | since_last: 10.75 seconds :: 


' :: "02_stg".base_semanticscholar_s2orcv2 queries finished | since_start: 11.0 minutes, 7.66 seconds | since_last: 10.75 seconds :: '

In [4]:
# (Re)create the table named in `table_name`, then preview three rows and
# count all rows, logging the elapsed time after each stage.
database_name = '02_stg'
table_name = 'stg_semanticscholar_combined_works'
# Quoted database + table, as used in the SQL statements and log messages below
qualified_table_name = f'"{database_name}".{table_name}'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'{qualified_table_name} created')

utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM {qualified_table_name} LIMIT 3 """, database_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM {qualified_table_name} """, database_name))
# log() prints the message and also returns it; the trailing ';' stops
# Jupyter from echoing the returned string a second time as cell output.
timelogger.log(f'{qualified_table_name} queries finished');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_semanticscholar_combined_works already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_semanticscholar_combined_works
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_semanticscholar_combined_works/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_semanticscholar_combined_works created | since_start: 14.0 minutes, 11.26 seconds | since_last: 14.0 minutes, 11.26 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,title,source_url,openaccess_status,content_text,annotations_paragraph,annotations_section_header,license,content_abstract,publication_year,publication_date
0,7631702,2068802494.0,10.1155/2014/543478,,Human Gaze Following Response Is Affected by Visual Acuity,https://pmc.ncbi.nlm.nih.gov/articles/PMC3997985,GOLD,"\nIntroduction\n\nVision is the ability to observe the world by interpreting light that is reflected from the surroundings and reaches the retina. Loss of visual function can severely affect daily human activities and may effectively decrease the quality of life [1,2]. Loss of vision can be caused by various ocular diseases, such as retinitis pigmentosa, macular degeneration [3], or glaucoma. Loss of vision can also occur as a symptom of other disorders like multiple sclerosis [4] or diabeti...","[{""attributes"":null,""end"":508,""start"":15},{""attributes"":null,""end"":1382,""start"":510},{""attributes"":null,""end"":2239,""start"":1384},{""attributes"":null,""end"":2917,""start"":2241},{""attributes"":null,""end"":4030,""start"":2919},{""attributes"":null,""end"":5318,""start"":4032},{""attributes"":null,""end"":5693,""start"":5343},{""attributes"":null,""end"":6450,""start"":5711},{""attributes"":null,""end"":6895,""start"":6488},{""attributes"":null,""end"":7923,""start"":6897},{""attributes"":null,""end"":8047,""start"":7925},{""attributes"":n...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":5341,""start"":5320},{""attributes"":{""n"":""2.2.""},""end"":5709,""start"":5695},{""attributes"":{""n"":""2.3.""},""end"":6486,""start"":6452},{""attributes"":null,""end"":8076,""start"":8049},{""attributes"":{""n"":""2.4.""},""end"":9758,""start"":9749},{""attributes"":{""n"":""3.""},""end"":10967,""start"":10960},{""attributes"":{""n"":""4.""},""end"":13985,""start"":13975},{""attributes"":{""n"":""5.""},""end"":19741,""start"":19731}]",CCBY,"The present study investigated how gaze following eye movements are affected by stimulus contrast and spatial frequency and by aberrations in central visual acuity due to refractive errors. 
We measured 30 healthy subjects with a range of visual acuities but without any refractive correction. Visual acuity was tested using a Landolt-C chart. Subjects were divided into three groups with low, intermediate, or good visual acuity. Gaze following responses (GFR) to moving Gabor patches were record...",2014,2014-04-06
1,52134621,2884767496.0,10.4102/curationis.v41i1.1931,,Record-keeping: Challenges experienced by nurses in selected public hospitals,https://pmc.ncbi.nlm.nih.gov/articles/PMC6111626,GOLD,"\nIntroduction and background\n\nGood nursing practice requires detailed record-keeping that is comprehensive, timely and accurate.Without complete recording there is no evidence to prove that care was provided to the patient, and in nursing practice there is a saying that 'what is not recorded has not been done' (Marinic 2015;Taiye 2015).Furthermore, poor record-keeping not only undermines patient care but makes the nurses more vulnerable to legal claims which arise from breakdown in commun...","[{""attributes"":null,""end"":1617,""start"":30},{""attributes"":null,""end"":2881,""start"":1619},{""attributes"":null,""end"":3676,""start"":2883},{""attributes"":null,""end"":4833,""start"":3697},{""attributes"":null,""end"":5005,""start"":4857},{""attributes"":null,""end"":5305,""start"":5035},{""attributes"":null,""end"":5603,""start"":5307},{""attributes"":null,""end"":5875,""start"":5605},{""attributes"":null,""end"":6148,""start"":5877},{""attributes"":null,""end"":6338,""start"":6177},{""attributes"":null,""end"":6610,""start"":6368},{""attributes""...","[{""attributes"":null,""end"":28,""start"":1},{""attributes"":null,""end"":3695,""start"":3678},{""attributes"":null,""end"":4855,""start"":4835},{""attributes"":null,""end"":5033,""start"":5007},{""attributes"":null,""end"":6175,""start"":6150},{""attributes"":null,""end"":6366,""start"":6340},{""attributes"":null,""end"":6632,""start"":6612},{""attributes"":null,""end"":6763,""start"":6740},{""attributes"":null,""end"":7154,""start"":7139},{""attributes"":null,""end"":8037,""start"":8024},{""attributes"":null,""end"":8683,""start"":8668},{""attributes"":nu...",CCBY,"Background Patients’ records provide a trace of care processes that have occurred and are further used as communication amongst nurses for continued management of 
patients. Nurses have the responsibility to ensure that records are accurate and complete in order to effectively manage their patients. In hospitals, nurses have to record a wide range of information in the patient’s records and this leads to increased workload on the part of nurses that compromises accurate record-keeping. Object...",2018,2018-07-30
2,252817053,,10.1007/s00415-022-11411-5,,Efficacy and safety of very early mobilization after thrombolysis in acute ischemic stroke: a randomized clinical trial,https://pmc.ncbi.nlm.nih.gov/articles/PMC9552146,BRONZE,\nIntroduction\n\nPractical clinical guidelines recommend two key treatments for acute ischemic stroke: care in units specializing in stroke management and thrombolysis with recombinant tissue plasminogen activator (rtPA). Thrombolysis is a specific treatment for acute ischemic stroke when given within the first 4.5 h of ischemic stroke onset [1]. \n\nIt may be administered to patients who wake up with stroke symptoms (wake-up stroke) or have an uncertain onset time > 4.5 h since last seen a...,"[{""attributes"":null,""end"":347,""start"":15},{""attributes"":null,""end"":667,""start"":349},{""attributes"":null,""end"":1294,""start"":669},{""attributes"":null,""end"":1674,""start"":1296},{""attributes"":null,""end"":2011,""start"":1676},{""attributes"":null,""end"":2254,""start"":2013},{""attributes"":null,""end"":2662,""start"":2256},{""attributes"":null,""end"":2842,""start"":2664},{""attributes"":null,""end"":3141,""start"":2881},{""attributes"":null,""end"":3514,""start"":3157},{""attributes"":null,""end"":4217,""start"":3516},{""attributes"":nul...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":2865,""start"":2844},{""attributes"":null,""end"":2879,""start"":2867},{""attributes"":null,""end"":3155,""start"":3143},{""attributes"":null,""end"":4232,""start"":4219},{""attributes"":null,""end"":5360,""start"":5352},{""attributes"":null,""end"":7763,""start"":7752},{""attributes"":null,""end"":8468,""start"":8455},{""attributes"":null,""end"":9102,""start"":9082},{""attributes"":null,""end"":9917,""start"":9902},{""attributes"":null,""end"":10369,""start"":10362},{""attributes"":...",,"Background Stroke has a deleterious impact on human health due to its high incidence, degree of disabling sequelae and mortality, 
constituting one of the main causes of death and disability worldwide. Objectives This study aimed to assess the efficacy and safety of very early mobilization (VEMG) after thrombolysis in functional recovery in patients with acute ischemic stroke. Methods The present study was an open, prospective, randomized study, with no blinded outcome, carried out in the str...",2022,2022-10-11


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_semanticscholar_combined_works queries finished | since_start: 14.0 minutes, 20.19 seconds | since_last: 8.94 seconds :: 


' :: "02_stg".stg_semanticscholar_combined_works queries finished | since_start: 14.0 minutes, 20.19 seconds | since_last: 8.94 seconds :: '

In [2]:
# (Re)create the table via utils.create_table_from_sql_file and log how long
# it took. With overwrite_strategy='overwrite' the recorded run shows the
# existing Glue table and its S3 objects being deleted before the rebuild.
database_name = '02_stg'
table_name = 'base_arxiv_metadata'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
# log() prints the message and also returns it; the trailing ';' stops
# Jupyter from echoing the returned string a second time as cell output.
timelogger.log(f'"{database_name}".{table_name} created');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.base_arxiv_metadata already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg base_arxiv_metadata
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_arxiv_metadata/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_arxiv_metadata created | since_start: 17.13 seconds | since_last: 17.13 seconds :: 


' :: "02_stg".base_arxiv_metadata created | since_start: 17.13 seconds | since_last: 17.13 seconds :: '

In [3]:
# Preview "02_stg".base_arxiv_metadata: first a handful of rows,
# then the total row count. Both queries run through Athena.
utils.pd_set_options(cols=500)

arxiv_preview = wr.athena.read_sql_query(
    'SELECT * FROM "02_stg".base_arxiv_metadata LIMIT 5 ',
    '02_stg',
)
display(arxiv_preview)

arxiv_row_count = wr.athena.read_sql_query(
    'SELECT COUNT(*) AS c FROM "02_stg".base_arxiv_metadata ',
    '02_stg',
)
display(arxiv_row_count)

Unnamed: 0,id_arxiv,id_doi,title,abstract,license
0,1810.00965,,Natural measures of alignment,Natural coordinate system will be proposed. In this coordinate system alignment procedure of a device and a detector can be easily performed. This approach is generalization of previous specific formulas in the field of calibration and provide top level description of the procedure. A basic example application to linac therapy plan is also provided.,ArXiv nonexclusive-distrib
1,1810.00967,,Efficient and Accurate Abnormality Mining from Radiology Reports with Customized False Positive Reduction,"Obtaining datasets labeled to facilitate model development is a challenge for most machine learning tasks. The difficulty is heightened for medical imaging, where data itself is limited in accessibility and labeling requires costly time and effort by trained medical specialists. Medical imaging studies, however, are often accompanied by a medical report produced by a radiologist, identifying important features on the corresponding scan for other physicians not specifically trained in radiolo...",ArXiv nonexclusive-distrib
2,1810.00952,10.1145/3211346.3211348,Relay: A New IR for Machine Learning Frameworks,"Machine learning powers diverse services in industry including search, translation, recommendation systems, and security. The scale and importance of these models require that they be efficient, expressive, and portable across an array of heterogeneous hardware devices. These constraints are often at odds; in order to better accommodate them we propose a new high-level intermediate representation (IR) called Relay. Relay is being designed as a purely-functional, statically-typed language wit...",ArXiv nonexclusive-distrib
3,1810.00956,,Challenges of Using Text Classifiers for Causal Inference,"Causal understanding is essential for many kinds of decision-making, but causal inference from observational data has typically only been applied to structured, low-dimensional datasets. While text classifiers produce low-dimensional outputs, their use in causal inference has not previously been studied. To facilitate causal analyses based on language data, we consider the role that text classifiers can play in causal inference through established modeling mechanisms from the causality liter...",ArXiv nonexclusive-distrib
4,1810.00958,,Measurement of the neutron lifetime using a magneto-gravitational trap,"Precision measurements of the free neutron lifetime $\tau_n$, when combined with measurements of the axial vector form factor, can be used to test unitarity of the CKM matrix. Non-unitarity is a signal for physics Beyond the Standard Model (BSM). Sensitivity to BSM physics requires measurements of $\tau_n$ to a precision of 0.1~s. However, the two dominant techniques to measure $\tau_n$ (colloquially beam and bottle measurements) disagree by nearly 10~s. UCN$\tau$ is a neutron lifetime exper...",ArXiv nonexclusive-distrib


Unnamed: 0,c
0,2816721


In [5]:
# (Re)create the table named in `table_name`, then preview three rows and
# count all rows, logging the elapsed time after each stage.
database_name = '02_stg'
table_name = 'stg_unified_works_01_joined_to_arxiv'
# Quoted database + table, as used in the SQL statements and log messages below
qualified_table_name = f'"{database_name}".{table_name}'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'{qualified_table_name} created')

utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM {qualified_table_name} LIMIT 3 """, database_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM {qualified_table_name} """, database_name))
# log() prints the message and also returns it; the trailing ';' stops
# Jupyter from echoing the returned string a second time as cell output.
timelogger.log(f'{qualified_table_name} queries finished');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_01_joined_to_arxiv/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_01_joined_to_arxiv created | since_start: 11.0 minutes, 30.35 seconds | since_last: 11.0 minutes, 30.35 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,title,source_url,openaccess_status,content_text,annotations_paragraph,annotations_section_header,content_abstract,publication_year,publication_date,license,license_allows_derivative_reuse
0,271885322,,10.1177/2325967124S00292,,Poster 326: Near Complete Quadriceps Tendon Healing 2 Years Following Harvest in Anterior Cruciate Ligament Reconstruction,https://pmc.ncbi.nlm.nih.gov/articles/PMC11328263,GOLD,"\nObjectives: The quadriceps tendon (QT) autograft has garnered popularity for anterior cruciate ligament (ACL) reconstruction (ACLR) in the last decade. However, as with other autograft choices, re-ruptures of QT autografts can occur especially in high-level athletes, with reported rates ranging from 2%-22%. ACL revision surgery is an option for these athletes, but it can be difficult for surgeons to decide what type of graft to select. Numerous studies have examined postoperative morpholog...","[{""attributes"":null,""end"":1032,""start"":1},{""attributes"":null,""end"":2982,""start"":1044},{""attributes"":null,""end"":4066,""start"":2984},{""attributes"":null,""end"":4487,""start"":4068},{""attributes"":null,""end"":4498,""start"":4489},{""attributes"":null,""end"":4511,""start"":4510}]","[{""attributes"":null,""end"":1042,""start"":1034}]","Objectives: The quadriceps tendon (QT) autograft has garnered popularity for anterior cruciate ligament (ACL) reconstruction (ACLR) in the last decade. However, as with other autograft choices, re-ruptures of QT autografts can occur especially in high-level athletes, with reported rates ranging from 2%-22%. ACL revision surgery is an option for these athletes, but it can be difficult for surgeons to decide what type of graft to select. Numerous studies have examined postoperative morphologic...",2024,2024-07-01,CCBYNCND,0
1,257612077,,10.3390/ph16030445,,Application of Convergent Science and Technology toward Ocular Disease Treatment,https://pmc.ncbi.nlm.nih.gov/articles/PMC10053244,GOLD,"\nIntroduction\n\nThe eye is one of the most complex sensory organs in the body and is divided into two main parts, namely, the anterior and posterior segments [1]. The anterior segment is the front one-third of the eye which is composed of cornea, trabecular meshwork, conjunctiva, pupil, iris, ciliary body, aqueous humor, and lens, and the posterior segment is the back two-thirds of the eye that comprises the choroid, sclera, retina, macula, vitreous humor, and optic nerve [2]. This complex...","[{""attributes"":null,""end"":654,""start"":15},{""attributes"":null,""end"":2802,""start"":656},{""attributes"":null,""end"":4950,""start"":2804},{""attributes"":null,""end"":6241,""start"":4981},{""attributes"":null,""end"":6895,""start"":6243},{""attributes"":null,""end"":7554,""start"":6897},{""attributes"":null,""end"":8154,""start"":7556},{""attributes"":null,""end"":8819,""start"":8156},{""attributes"":null,""end"":9213,""start"":8821},{""attributes"":null,""end"":10156,""start"":9215},{""attributes"":null,""end"":11508,""start"":10158},{""attributes...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":4979,""start"":4952},{""attributes"":{""n"":""3.""},""end"":11560,""start"":11510},{""attributes"":{""n"":""3.1.""},""end"":13577,""start"":13514},{""attributes"":{""n"":""3.2.""},""end"":19843,""start"":19801},{""attributes"":{""n"":""4.""},""end"":30564,""start"":30481},{""attributes"":{""n"":""4.1.""},""end"":30630,""start"":30566},{""attributes"":null,""end"":33865,""start"":33792},{""attributes"":null,""end"":35159,""start"":35142},{""attributes"":null,""end"":35472,""start"":354...","Eyes are one of the main critical organs of the body that provide our brain with the most information about the surrounding environment. 
Disturbance in the activity of this informational organ, resulting from different ocular diseases, could affect the quality of life, so finding appropriate methods for treating ocular disease has attracted lots of attention. This is especially due to the ineffectiveness of the conventional therapeutic method to deliver drugs into the interior parts of the e...",2023,2023-03-01,CCBY,1
2,81760056,2908600938.0,10.5772/INTECHOPEN.77120,,Use of ECMO in Sepsis and Septic Shock,https://doi.org/10.5772/INTECHOPEN.77120,HYBRID,"\nIntroduction\n\nSince the publication in the early 1970s of the first successful use of extracorporeal membrane oxygenation (ECMO) in a post-traumatic adult respiratory distress (ARDS) patient [1], ECMO has been used tremendously. The approach can be either by veno-venous cannulation, which is mainly used in hypoxic respiratory failure, or by veno-arterial cannulation, which is the preferred modality for cardiac (or combined) support. This implements that most indications are in the field ...","[{""attributes"":null,""end"":531,""start"":15},{""attributes"":null,""end"":1333,""start"":533},{""attributes"":null,""end"":1962,""start"":1378},{""attributes"":null,""end"":2336,""start"":1964},{""attributes"":null,""end"":2718,""start"":2338},{""attributes"":null,""end"":2944,""start"":2720},{""attributes"":null,""end"":3188,""start"":2946},{""attributes"":null,""end"":3714,""start"":3190},{""attributes"":null,""end"":4413,""start"":3733},{""attributes"":null,""end"":5171,""start"":4415},{""attributes"":null,""end"":5607,""start"":5173},{""attributes"":n...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":1363,""start"":1335},{""attributes"":{""n"":""2.1.1.""},""end"":1376,""start"":1365},{""attributes"":{""n"":""2.1.2.""},""end"":3731,""start"":3716},{""attributes"":{""n"":""2.1.3.""},""end"":8137,""start"":8090},{""attributes"":{""n"":""2.1.3.2.""},""end"":14008,""start"":13979},{""attributes"":{""n"":""2.2.""},""end"":17899,""start"":17891},{""attributes"":{""n"":""3.""},""end"":21043,""start"":21023},{""attributes"":{""n"":""3.1.""},""end"":21388,""start"":21374},{""attributes"":{""n"":""...","The use of extracorporeal membrane oxygenation (ECMO) has always been controversial in the past. Evidence was mainly build up in neonates and much controversy remained in adults. 
The main adult indications were mechanical support (e.g., in cardiogenic shock) or respiratory support (e.g., in the field of acute respiratory distress syndrome (ARDS)). Sepsis was historically often considered as a contraindication. As a consequence of sev - eral worldwide flu outbreaks, the use of ECMO in infecti...",2018,2018-11-05,CCBY,1


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_unified_works_01_joined_to_arxiv queries finished | since_start: 11.0 minutes, 42.05 seconds | since_last: 11.71 seconds :: 


' :: "02_stg".stg_unified_works_01_joined_to_arxiv queries finished | since_start: 11.0 minutes, 42.05 seconds | since_last: 11.71 seconds :: '