In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Make the shared helper modules importable: a `01_modules` directory is looked
# for one level above the script directory (dev path) and directly inside it
# (prod path); every location that exists is appended to sys.path.
# When run as a script __file__ is defined; in a notebook kernel it is not, so
# the current working directory is used as the anchor instead.
script_dir = (
    pathlib.Path(__file__).parent.resolve()
    if '__file__' in globals()
    else pathlib.Path().absolute()
)
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
# Dev path is checked (and appended) before the prod path.
sys.path.extend(
    path
    for path in (modules_path_in_dev, modules_path_in_prod)
    if os.path.exists(path)
)


# Jupyter only executes a local module the first time it is imported after
# kernel start; re-running an import statement in a cell does not pick up
# edits made to the module file since then.
# Workaround: import the module object and force it to be re-executed with
# importlib.reload every time this cell runs.
# The reload result is bound to `_` so the returned module object is not
# echoed as the cell's output.
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [2]:
# One-off SageMaker processing job that runs the OpenAlex works reduction
# script. It takes about a day, so it is disabled by default: set the flag to
# True only when the reduced works have to be regenerated.
RERUN_OPENALEX_WORKS_REDUCTION = False

if RERUN_OPENALEX_WORKS_REDUCTION:
    execution_role = get_execution_role()
    # `code` below is given relative to this directory, which is two levels
    # above the current working directory.
    source_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    print('source_dir:', source_dir)
    sklearn_processor = FrameworkProcessor(
        estimator_cls=SKLearn,
        framework_version='1.2-1', # The newest supported version by sagemaker
        instance_type='ml.c7i.16xlarge',
        instance_count=1,
        # Hyphenated on purpose: the script name uses underscores, the job name must not.
        base_job_name='openalex-works-reduction',
        role=execution_role
    )

    step_args = sklearn_processor.run(
        code='src/03_transformation/03_11_transformation_openalex_works_reduction.py',
        source_dir=source_dir,
        inputs=[], # We are not using automatic input-output mapping, instead we handle everything in the script directly on S3
        outputs=[],
        arguments=[
            '--runtype', 'prod',
            '--file-max-limit', '10000',
        ],
        wait=True  # block the cell until the job has finished
    )

In [30]:
# (Re)create "02_stg".base_openalex_works_reduced_filtered via the project's
# SQL-file helper, then display a 3-row sample and the total row count as a
# sanity check. Timings for both phases are printed by the TimeLogger.
database_name = '02_stg'
table_name = 'base_openalex_works_reduced_filtered'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{database_name}".{table_name} created')

# NOTE(review): `cols` appears to cap how many characters of each cell are shown - confirm in utils.
utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{database_name}".{table_name} LIMIT 3 """, database_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{database_name}".{table_name} """, database_name))
# log() prints the message and also returns it; binding the result keeps the
# returned string from being echoed a second time as the cell's output.
_ = timelogger.log(f'"{database_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_openalex_works_reduced_filtered/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_openalex_works_reduced_filtered created | since_start: 42.36 seconds | since_last: 42.36 seconds :: 


Unnamed: 0,id_openalex,id_doi,title,language,primary_topic_id,primary_topic_display_name,primary_topic_subfield_id,primary_topic_subfield_display_name,primary_topic_field_id,primary_topic_field_display_name,primary_topic_domain_id,primary_topic_domain_display_name
0,2184395120,,AN APPROACH FOR COMPRESSING DIGITAL IMAGES BY USING RUN LENGTH ENCODING,en,T10901,Advanced Data Compression Techniques,1707,Computer Vision and Pattern Recognition,17,Computer Science,3,Physical Sciences
1,2184397778,,LEARNINGMINIMUM VOLUME SETSWITH SUPPORTVECTOR MACHINES,en,T11512,Anomaly Detection Techniques and Applications,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences
2,2184397944,,Analyzing thePerformance ofVoice over Internet Protocol ina 3GNetwork,en,T10575,Wireless Communication Networks Research,1705,Computer Networks and Communications,17,Computer Science,3,Physical Sciences


Unnamed: 0,c
0,13242469


 :: "02_stg".base_openalex_works_reduced_filtered queries finished | since_start: 50.44 seconds | since_last: 8.08 seconds :: 


' :: "02_stg".base_openalex_works_reduced_filtered queries finished | since_start: 50.44 seconds | since_last: 8.08 seconds :: '

In [3]:
# (Re)create "02_stg".base_semanticscholar_s2orcv2 via the project's SQL-file
# helper, then display a 3-row sample and the total row count as a sanity
# check. Timings for both phases are printed by the TimeLogger.
database_name = '02_stg'
table_name = 'base_semanticscholar_s2orcv2'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{database_name}".{table_name} created')

# NOTE(review): `cols` appears to cap how many characters of each cell are shown - confirm in utils.
utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{database_name}".{table_name} LIMIT 3 """, database_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{database_name}".{table_name} """, database_name))
# log() prints the message and also returns it; binding the result keeps the
# returned string from being echoed a second time as the cell's output.
_ = timelogger.log(f'"{database_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.base_semanticscholar_s2orcv2 already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg base_semanticscholar_s2orcv2
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_semanticscholar_s2orcv2/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_semanticscholar_s2orcv2 created | since_start: 11.0 minutes, 1.71 seconds | since_last: 11.0 minutes, 1.71 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,title,source_url,openaccess_status,content_text,annotations_paragraph,annotations_section_header,license
0,1808709,2101143116,10.1186/1476-511x-13-132,,Twist 1 regulates the expression of PPARγ during hormone-induced 3T3-L1 preadipocyte differentiation: a possible role in obesity and associated diseases,https://pmc.ncbi.nlm.nih.gov/articles/PMC4150960,GOLD,"\nBackground\n\nObesity has become an epidemic in the human population, and China has the highest number of obese patients in the world [1]. Because obesity involves an increase in the number of adipocytes, any of the factors involved in adipocyte differentiation might be of great importance for the development of obesity. To date, numerous factors and proteins have been implicated in the generation of new fat cells, including peroxisome proliferator-activated receptor gamma (PPARγ) [2,3], C...","[{""attributes"":null,""end"":781,""start"":13},{""attributes"":null,""end"":2087,""start"":783},{""attributes"":null,""end"":3470,""start"":2089},{""attributes"":null,""end"":4312,""start"":3472},{""attributes"":null,""end"":5329,""start"":4323},{""attributes"":null,""end"":5818,""start"":5409},{""attributes"":null,""end"":6151,""start"":5890},{""attributes"":null,""end"":6744,""start"":6218},{""attributes"":null,""end"":8054,""start"":6870},{""attributes"":null,""end"":8511,""start"":8056},{""attributes"":null,""end"":9031,""start"":8525},{""attributes"":n...","[{""attributes"":null,""end"":11,""start"":1},{""attributes"":null,""end"":4321,""start"":4314},{""attributes"":null,""end"":5407,""start"":5331},{""attributes"":null,""end"":5888,""start"":5820},{""attributes"":null,""end"":6216,""start"":6153},{""attributes"":null,""end"":6868,""start"":6746},{""attributes"":null,""end"":8523,""start"":8513},{""attributes"":null,""end"":16548,""start"":16527},{""attributes"":null,""end"":16559,""start"":16550},{""attributes"":null,""end"":17519,""start"":17439},{""attributes"":null,""end"":19472,""start"":19445},{""attrib...",CCBY
1,3738414,2789005638,10.1155/2018/1042479,,PROM and Labour Effects on Urinary Metabolome: A Pilot Study,https://pmc.ncbi.nlm.nih.gov/articles/PMC5817378,GOLD,"\nIntroduction\n\nThe early diagnosis of pregnancy-related complications and the prediction of pregnancy outcome are considered strategic clinical goals to ensure the health of mothers and of their babies. Among these, premature rupture of membranes (PROM) consists of the rupture of the foetal membranes before the onset of labour. It can be observed at any gestational age [1] and occurs in approximately 10% of pregnant women and in roughly 40% of preterm deliveries [2]. Foetal membranes are ...","[{""attributes"":null,""end"":4623,""start"":15},{""attributes"":null,""end"":6006,""start"":4703},{""attributes"":null,""end"":7040,""start"":6040},{""attributes"":null,""end"":7808,""start"":7063},{""attributes"":null,""end"":9464,""start"":7836},{""attributes"":null,""end"":10835,""start"":9466},{""attributes"":null,""end"":11411,""start"":10861},{""attributes"":null,""end"":12575,""start"":11413},{""attributes"":null,""end"":12831,""start"":12585},{""attributes"":null,""end"":12984,""start"":12833},{""attributes"":null,""end"":13373,""start"":12986},{""...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":4646,""start"":4625},{""attributes"":{""n"":""2.1.""},""end"":4701,""start"":4648},{""attributes"":{""n"":""2.2.""},""end"":6038,""start"":6008},{""attributes"":{""n"":""2.3.""},""end"":7061,""start"":7042},{""attributes"":{""n"":""2.4.""},""end"":7834,""start"":7810},{""attributes"":{""n"":""3.""},""end"":10859,""start"":10837},{""attributes"":{""n"":""3.3.""},""end"":12583,""start"":12577},{""attributes"":{""n"":""4.""},""end"":13385,""start"":13375},{""attributes"":{""n"":""5.""},""end"":167...",CCBY
2,15710447,2741402565,10.18653/v1/w17-1005,,Word Embedding and Topic Modeling Enhanced Multiple Features for Content Linking and Argument / Sentiment Labeling in Online Forums,https://aclanthology.org/W17-1005,HYBRID,"\nIntroduction\n\nComments to news and their providers in online forums have been increasing rapidly in recent years with a large number of user participants and huge amount of interactive contents. How can we understand the mass of comments effectively? A crucial initial step towards this goal should be content linking, which is to determine what comments link to, be that either specific news snippets or comments by other users. Furthermore, a set of labels for a given link may be articulat...","[{""attributes"":null,""end"":585,""start"":15},{""attributes"":null,""end"":813,""start"":587},{""attributes"":null,""end"":1141,""start"":815},{""attributes"":null,""end"":1511,""start"":1143},{""attributes"":null,""end"":2352,""start"":1531},{""attributes"":null,""end"":3126,""start"":2363},{""attributes"":null,""end"":3193,""start"":3145},{""attributes"":null,""end"":3418,""start"":3211},{""attributes"":null,""end"":3543,""start"":3452},{""attributes"":null,""end"":3672,""start"":3545},{""attributes"":null,""end"":3984,""start"":3674},{""attributes"":nul...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{""attributes"":{""n"":""2""},""end"":1529,""start"":1513},{""attributes"":{""n"":""3""},""end"":2361,""start"":2354},{""attributes"":{""n"":""3.1""},""end"":3143,""start"":3128},{""attributes"":{""n"":""3.1.1""},""end"":3209,""start"":3195},{""attributes"":{""n"":""3.1.2""},""end"":3450,""start"":3420},{""attributes"":{""n"":""3.1.3""},""end"":5187,""start"":5154},{""attributes"":{""n"":""3.2""},""end"":6316,""start"":6302},{""attributes"":null,""end"":7038,""start"":7024},{""attributes"":null,""end"":7173,""start"":7150},{""at...",CCBY


Unnamed: 0,c
0,11609787


 :: "02_stg".base_semanticscholar_s2orcv2 queries finished | since_start: 11.0 minutes, 11.75 seconds | since_last: 10.04 seconds :: 


' :: "02_stg".base_semanticscholar_s2orcv2 queries finished | since_start: 11.0 minutes, 11.75 seconds | since_last: 10.04 seconds :: '

In [4]:
# (Re)create "02_stg".stg_semanticscholar_combined_works via the project's
# SQL-file helper, then display a 3-row sample and the total row count as a
# sanity check. Timings for both phases are printed by the TimeLogger.
database_name = '02_stg'
table_name = 'stg_semanticscholar_combined_works'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{database_name}".{table_name} created')

# NOTE(review): `cols` appears to cap how many characters of each cell are shown - confirm in utils.
utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{database_name}".{table_name} LIMIT 3 """, database_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{database_name}".{table_name} """, database_name))
# log() prints the message and also returns it; binding the result keeps the
# returned string from being echoed a second time as the cell's output.
_ = timelogger.log(f'"{database_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_semanticscholar_combined_works already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_semanticscholar_combined_works
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_semanticscholar_combined_works/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_semanticscholar_combined_works created | since_start: 13.0 minutes, 48.32 seconds | since_last: 13.0 minutes, 48.32 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,title,source_url,openaccess_status,content_text,annotations_paragraph,annotations_section_header,license,content_abstract,publication_year,publication_date
0,220531160,3042431448.0,10.1167/iovs.61.8.16,,Quantitative Fundus Autofluorescence in Rhesus Macaques in Aging and Age-Related Drusen,https://pmc.ncbi.nlm.nih.gov/articles/PMC7425688,GOLD,"\nD iseases of the macula, such as age-related macular degeneration (AMD) and diabetic macular edema, are leading causes of visual impairment in developed countries. 1 Animal models of macular conditions can further detail the mechanisms of their pathogenesis and reveal new insights into developing novel interventions. Nonhuman primates (NHPs) are a compelling animal model for studying macular diseases as they are the only mammals beside humans to possess a true macula. NHPs, such as rhesus ...","[{""attributes"":null,""end"":1340,""start"":1},{""attributes"":null,""end"":2554,""start"":1342},{""attributes"":null,""end"":4618,""start"":2580},{""attributes"":null,""end"":6008,""start"":4640},{""attributes"":null,""end"":7798,""start"":6010},{""attributes"":null,""end"":9037,""start"":7824},{""attributes"":null,""end"":10039,""start"":9078},{""attributes"":null,""end"":11377,""start"":10063},{""attributes"":null,""end"":12469,""start"":11391},{""attributes"":null,""end"":13219,""start"":12492},{""attributes"":null,""end"":13875,""start"":13257},{""att...","[{""attributes"":null,""end"":2563,""start"":2556},{""attributes"":null,""end"":2578,""start"":2565},{""attributes"":null,""end"":4638,""start"":4620},{""attributes"":null,""end"":7822,""start"":7800},{""attributes"":null,""end"":9076,""start"":9039},{""attributes"":null,""end"":10061,""start"":10041},{""attributes"":null,""end"":11389,""start"":11379},{""attributes"":null,""end"":12478,""start"":12471},{""attributes"":null,""end"":12490,""start"":12480},{""attributes"":null,""end"":13255,""start"":13221},{""attributes"":null,""end"":14316,""start"":14284}...",CCBYNCND,"Purpose To employ quantitative fundus autofluorescence (qAF) imaging in rhesus macaques to noninvasively assess retinal pigment epithelial (RPE) lipofuscin in 
nonhuman primates (NHPs) as a model of aging and age-related macular degeneration (AMD). Methods The qAF imaging was performed on eyes of 26 rhesus macaques (mean age 18.8 ± 8.2 years, range 4–27 years) with normal-appearing fundus or with age-related soft drusen using a confocal scanning laser ophthalmoscope with 488 nm excitation and...",2020,2020-07-01
1,268446036,,10.1007/s40670-024-02017-9,,Humanism Rounds: A Multifaceted “Back to Bedside” Initiative to Improve Meaning at Work for Internal Medicine Residents,https://pmc.ncbi.nlm.nih.gov/articles/PMC11180076,HYBRID,"\nIntroduction\n\nBurnout affects medical residents nationwide, leading to poor resident wellbeing, career dissatisfaction, and decreased quality of patient care [1,2].The rates of burnout among residents range from 27 to 75%, with high rates noted in obstetrics and gynecology (75%), internal medicine (63%), and general surgery (40%) with the lowest rate among family medicine residents (27%) [3].Research into burnout during residency has focused on a variety of contributing factors including...","[{""attributes"":null,""end"":1032,""start"":15},{""attributes"":null,""end"":1487,""start"":1034},{""attributes"":null,""end"":1960,""start"":1489},{""attributes"":null,""end"":2269,""start"":1985},{""attributes"":null,""end"":3087,""start"":2291},{""attributes"":null,""end"":3559,""start"":3118},{""attributes"":null,""end"":4961,""start"":3612},{""attributes"":null,""end"":5991,""start"":4980},{""attributes"":null,""end"":6547,""start"":5993},{""attributes"":null,""end"":7174,""start"":6571},{""attributes"":null,""end"":7594,""start"":7185},{""attributes""...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":1983,""start"":1962},{""attributes"":null,""end"":2289,""start"":2271},{""attributes"":null,""end"":3103,""start"":3089},{""attributes"":null,""end"":3116,""start"":3105},{""attributes"":null,""end"":3590,""start"":3561},{""attributes"":null,""end"":3610,""start"":3592},{""attributes"":null,""end"":4978,""start"":4963},{""attributes"":null,""end"":6569,""start"":6549},{""attributes"":null,""end"":7183,""start"":7176},{""attributes"":null,""end"":7641,""start"":7596},{""attributes"":nu...",CCBY,"Introduction Burnout is an increasingly prevalent problem among resident physicians. 
To address this problem, the Accreditation Council on Graduate Medical Education (ACGME) created the Back to Bedside initiative, supporting resident-driven projects focused on increasing direct interactions with patients. In 2017, Baylor College of Medicine (BCM) Internal Medicine Residency received a Back to Bedside grant to develop and implement “Humanism Rounds,” a multifaceted program which sought to pro...",2024,2024-03-13
2,249401160,,10.1007/s12471-022-01700-z,,Major adverse cardiovascular events in older emergency department patients presenting with non-cardiac medical complaints,https://pmc.ncbi.nlm.nih.gov/articles/PMC9691805,GOLD,"\nIntroduction\n\nOlder patients are at high risk of adverse outcomes after an emergency department (ED) visit [1,2]. However, the risk of major adverse cardiovascular events (MACE) for older ED patients, presenting with noncardiac medical complaints, is unknown. Because preventive measures may improve outcome [3], early identification of patients at risk is highly important. \n\nBesides conventional cardiovascular risk factors, the cardiac biomarkers high-sensitivity cardiac Troponin T (hs-...","[{""attributes"":null,""end"":376,""start"":15},{""attributes"":null,""end"":783,""start"":378},{""attributes"":null,""end"":1290,""start"":785},{""attributes"":null,""end"":2491,""start"":1337},{""attributes"":null,""end"":2771,""start"":2493},{""attributes"":null,""end"":3143,""start"":2796},{""attributes"":null,""end"":4944,""start"":3162},{""attributes"":null,""end"":5900,""start"":4946},{""attributes"":null,""end"":6149,""start"":5902},{""attributes"":null,""end"":6368,""start"":6169},{""attributes"":null,""end"":6672,""start"":6385},{""attributes"":nul...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":1299,""start"":1292},{""attributes"":null,""end"":1335,""start"":1301},{""attributes"":null,""end"":2794,""start"":2773},{""attributes"":null,""end"":3160,""start"":3145},{""attributes"":null,""end"":6167,""start"":6151},{""attributes"":null,""end"":6383,""start"":6370},{""attributes"":null,""end"":8057,""start"":8050},{""attributes"":null,""end"":8103,""start"":8059},{""attributes"":null,""end"":9631,""start"":9566},{""attributes"":null,""end"":10054,""start"":10031},{""attributes"":...",CCBY,"The risk of major adverse cardiovascular events (MACE) for older emergency department (ED) patients presenting with non-cardiac 
medical complaints is unknown. To apply preventive measures timely, early identification of high-risk patients is incredibly important. We aimed at investigating the incidence of MACE within one year after their ED visit and the predictive value of high-sensitivity cardiac troponin T (hs-cTnT) and N‑terminal pro-B-type natriuretic peptide (NT-proBNP) for subsequent ...",2022,2022-06-07


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_semanticscholar_combined_works queries finished | since_start: 13.0 minutes, 58.61 seconds | since_last: 10.29 seconds :: 


' :: "02_stg".stg_semanticscholar_combined_works queries finished | since_start: 13.0 minutes, 58.61 seconds | since_last: 10.29 seconds :: '

In [18]:
# (Re)create "02_stg".stg_semanticscholar_combined_works_content via the
# project's SQL-file helper, then display a 3-row sample and the total row
# count as a sanity check. Timings for both phases are printed by the TimeLogger.
database_name = '02_stg'
table_name = 'stg_semanticscholar_combined_works_content'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{database_name}".{table_name} created')

# NOTE(review): `cols` appears to cap how many characters of each cell are shown - confirm in utils.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{database_name}".{table_name} LIMIT 3 """, database_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{database_name}".{table_name} """, database_name))
# log() prints the message and also returns it; binding the result keeps the
# returned string from being echoed a second time as the cell's output.
_ = timelogger.log(f'"{database_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_semanticscholar_combined_works_content/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_semanticscholar_combined_works_content created | since_start: 11.0 minutes, 20.68 seconds | since_last: 11.0 minutes, 20.68 seconds :: 


Unnamed: 0,id_semanticscholar,title,content_abstract,content_text,annotations_paragraph,annotations_section_header
0,260163299,Screening depression and anxiety in Indigenous peoples: A global scoping review,"Indigenous peoples’ worldviews are intricately interconnected and interrelated with their communities and the environments in which they live. Their worldviews also manifest in a holistic view of health and well-being, which contrasts with those of the dominant western biomedical model. However,...","\nIntroduction\n\nThe worldviews of Indigenous peoples are intricately interrelated and interconnected with those of their communities and the environments in which they live. Indigenous people conceptualise health and well-being more holistically (Gall et al., 2021) than the dominant western bi...","[{""attributes"":null,""end"":512,""start"":15},{""attributes"":null,""end"":1314,""start"":514},{""attributes"":null,""end"":2095,""start"":1316},{""attributes"":null,""end"":3698,""start"":2097},{""attributes"":null,""end"":4904,""start"":3700},{""attributes"":null,""end"":5652,""start"":4906},{""attributes"":null,""end"":6585,""star...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":5660,""start"":5654},{""attributes"":null,""end"":7233,""start"":7215},{""attributes"":null,""end"":11222,""start"":11207},{""attributes"":null,""end"":12489,""start"":12475},{""attributes"":null,""end"":13587,""start"":13572},{""attributes"":null,""end"":14789..."
1,112601881,Model Development of a Blast Furnace Stove,,\nIntroduction\n\nAbout one third of the world primary energy consumption is from the manufacturing industries. The iron and steel industry (ISI) is the second largest energy user and accounts for 20 % of the energy usage by the manufacturing industries [1]. Due to heavy reliance on fossil fuels...,"[{""attributes"":null,""end"":519,""start"":15},{""attributes"":null,""end"":1303,""start"":521},{""attributes"":null,""end"":1735,""start"":1305},{""attributes"":null,""end"":2270,""start"":1737},{""attributes"":null,""end"":2995,""start"":2272},{""attributes"":null,""end"":3406,""start"":2997},{""attributes"":null,""end"":3691,""star...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":3414,""start"":3408},{""attributes"":null,""end"":4208,""start"":4172},{""attributes"":{""n"":""2.1.""},""end"":5536,""start"":5527},{""attributes"":{""n"":""2.2.""},""end"":7243,""start"":7228},{""attributes"":{""n"":""2.3.""},""end"":7941,""start"":7909},..."
2,118573517,Double beta decay transition mechanism,"After briefly reviewing $\beta \beta$ decay as a test of the neutrino mass, I examine the nuclear structure involved in this process. Simple formulas (\`{a} la Pad\'{e}) are designed for the transition amplitudes and the general behavior of $\beta \beta$ decay amplitudes in the quasiparticle ran...",\nIntroduction\n\nThe double beta (ββ) decay is a nice example of the interrelation between the Particle Physics and the Nuclear Physics: we can get information on the properties of the neutrino and the weak interaction from the ββ decay only if we know who to deal we the nuclear structure invol...,"[{""attributes"":null,""end"":582,""start"":15},{""attributes"":null,""end"":1069,""start"":584},{""attributes"":null,""end"":1224,""start"":1071},{""attributes"":null,""end"":1310,""start"":1226},{""attributes"":null,""end"":1343,""start"":1312},{""attributes"":null,""end"":1420,""start"":1345},{""attributes"":null,""end"":1839,""star...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""3.""},""end"":10044,""start"":9997},{""attributes"":{""n"":""4.""},""end"":14193,""start"":14165}]"


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_semanticscholar_combined_works_content queries finished | since_start: 11.0 minutes, 31.19 seconds | since_last: 10.51 seconds :: 


' :: "02_stg".stg_semanticscholar_combined_works_content queries finished | since_start: 11.0 minutes, 31.19 seconds | since_last: 10.51 seconds :: '

In [19]:
# (Re)create "02_stg".stg_semanticscholar_combined_works_metadata via the
# project's SQL-file helper, then display a 3-row sample and the total row
# count as a sanity check. Timings for both phases are printed by the TimeLogger.
database_name = '02_stg'
table_name = 'stg_semanticscholar_combined_works_metadata'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{database_name}".{table_name} created')

# NOTE(review): `cols` appears to cap how many characters of each cell are shown - confirm in utils.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{database_name}".{table_name} LIMIT 3 """, database_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{database_name}".{table_name} """, database_name))
# log() prints the message and also returns it; binding the result keeps the
# returned string from being echoed a second time as the cell's output.
_ = timelogger.log(f'"{database_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_semanticscholar_combined_works_metadata/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_semanticscholar_combined_works_metadata created | since_start: 5.85 seconds | since_last: 5.85 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,source_url,openaccess_status,license,publication_year,publication_date
0,46791531,2755160340.0,10.1364/oe.25.023899,,https://doi.org/10.1364/OE.25.023899,GOLD,CCBY,2017,2017-10-02
1,13140185,2010414254.0,10.3390/rs6031863,,https://doi.org/10.3390/rs6031863,GOLD,CCBY,2014,2014-02-28
2,253366618,,10.1016/j.xpro.2022.101803,,https://pmc.ncbi.nlm.nih.gov/articles/PMC9641055,GOLD,CCBYNCND,2022,2022-11-04


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_semanticscholar_combined_works_metadata queries finished | since_start: 15.13 seconds | since_last: 9.28 seconds :: 


' :: "02_stg".stg_semanticscholar_combined_works_metadata queries finished | since_start: 15.13 seconds | since_last: 9.28 seconds :: '

In [5]:
# (Re)create "02_stg".base_arxiv_metadata via the project's SQL-file helper,
# then display a 3-row sample and the total row count as a sanity check.
# Timings for both phases are printed by the TimeLogger.
database_name = '02_stg'
table_name = 'base_arxiv_metadata'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{database_name}".{table_name} created')

# NOTE(review): `cols` appears to cap how many characters of each cell are shown - confirm in utils.
utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{database_name}".{table_name} LIMIT 3 """, database_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{database_name}".{table_name} """, database_name))
# log() prints the message and also returns it; binding the result keeps the
# returned string from being echoed a second time as the cell's output.
_ = timelogger.log(f'"{database_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.base_arxiv_metadata already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg base_arxiv_metadata
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_arxiv_metadata/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_arxiv_metadata created | since_start: 15.44 seconds | since_last: 15.44 seconds :: 


Unnamed: 0,id_arxiv,id_doi,title,abstract,license
0,903.1601,,Parabolic-Dish Solar Concentrators of Film on Foam,"Parabolic and spherical mirrors are constructed of aluminized PET polyester film on urethane foam. During construction, the chosen shape of the mirror is created by manipulating the elastic/plastic behavior of the film with air pressure. Foam is then applied to the film and, once hardened, air pressure is removed. At an f-number of 0.68, preliminary models have an optical angular spread of less than 0.25 degrees, a factor of 3.3 smaller than that for a perfectly spherical mirror. The possi...",ArXiv nonexclusive-distrib
1,903.1604,10.3842/SIGMA.2009.029,Limits of Gaudin Systems: Classical and Quantum Cases,"We consider the XXX homogeneous Gaudin system with $N$ sites, both in classical and the quantum case. In particular we show that a suitable limiting procedure for letting the poles of its Lax matrix collide can be used to define new families of Liouville integrals (in the classical case) and new ""Gaudin"" algebras (in the quantum case). We will especially treat the case of total collisions, that gives rise to (a generalization of) the so called Bending flows of Kapovich and Millson. Some as...",CCBYNCSA
2,903.16,,Typically Real Harmonic Functions,We consider a class $\THO$ of typically real harmonic functions on the unit disk that contains the class of normalized analytic and typically real functions. We also obtain some partial results about the region of univalence for this class.,ArXiv nonexclusive-distrib


Unnamed: 0,c
0,2816721


 :: "02_stg".base_arxiv_metadata queries finished | since_start: 24.78 seconds | since_last: 9.35 seconds :: 


' :: "02_stg".base_arxiv_metadata queries finished | since_start: 24.78 seconds | since_last: 9.35 seconds :: '

In [21]:
# (Re)create "02_stg".stg_unified_works_metadata_01_joined_to_arxiv via the
# project's SQL-file helper, then display a 3-row sample and the total row
# count as a sanity check. Timings for both phases are printed by the TimeLogger.
database_name = '02_stg'
table_name = 'stg_unified_works_metadata_01_joined_to_arxiv'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{database_name}".{table_name} created')

# NOTE(review): `cols` appears to cap how many characters of each cell are shown - confirm in utils.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{database_name}".{table_name} LIMIT 3 """, database_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{database_name}".{table_name} """, database_name))
# log() prints the message and also returns it; binding the result keeps the
# returned string from being echoed a second time as the cell's output.
_ = timelogger.log(f'"{database_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_metadata_01_joined_to_arxiv/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_metadata_01_joined_to_arxiv created | since_start: 11.79 seconds | since_last: 11.79 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,source_url,openaccess_status,publication_year,publication_date,license,license_allows_derivative_reuse
0,4951032,2789407911,10.1016/j.tecto.2018.03.010,,https://doi.org/10.1016/J.TECTO.2018.03.010,HYBRID,2018,2018-04-22,CCBY,1
1,204923380,2981732074,10.1016/j.jalz.2019.08.201,,https://pmc.ncbi.nlm.nih.gov/articles/PMC7012375,HYBRID,2019,2019-10-28,CCBYNCND,0
2,14519185,2114693455,10.1099/vir.0.007377-0,,https://pmc.ncbi.nlm.nih.gov/articles/PMC2885064,HYBRID,2009,2009-03-01,CCBY,1


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_unified_works_metadata_01_joined_to_arxiv queries finished | since_start: 21.14 seconds | since_last: 9.35 seconds :: 


' :: "02_stg".stg_unified_works_metadata_01_joined_to_arxiv queries finished | since_start: 21.14 seconds | since_last: 9.35 seconds :: '

In [2]:
# Build "02_stg".stg_unified_works_metadata_02_joined_to_openalex via
# utils.create_table_from_sql_file, then preview it and count its rows.
table_name = 'stg_unified_works_metadata_02_joined_to_openalex'
db_name = '02_stg'  # named once here instead of repeating the literal in every call below
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

# Sanity checks: show three rows, then the total row count.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_unified_works_metadata_02_joined_to_openalex already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_unified_works_metadata_02_joined_to_openalex
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_metadata_02_joined_to_openalex/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_metadata_02_joined_to_openalex created | since_start: 10.35 seconds | since_last: 10.35 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on
0,235199166,,10.2196/23099,,2020,2020-07-31,CCBY,1,https://pmc.ncbi.nlm.nih.gov/articles/PMC8190645,0,1,1,3165206559,10.2196/23099,en,10028,Topic Modeling,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,doi
1,259039518,,10.1109/lwc.2023.3281881,2306.0538,2023,2023-05-24,ArXiv nonexclusive-distrib,0,https://arxiv.org/abs/2306.05380,0,1,1,4379033818,10.1109/lwc.2023.3281881,en,10764,Privacy-Preserving Technologies in Data,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,doi
2,271769360,,10.18653/v1/2024.wassa-1.43,,2024,,unknown-reusability,0,https://aclanthology.org/2024.wassa-1.43,0,1,1,4402670554,10.18653/v1/2024.wassa-1.43,en,10028,Topic Modeling,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,doi


Unnamed: 0,c
0,834072


 :: "02_stg".stg_unified_works_metadata_02_joined_to_openalex queries finished | since_start: 18.42 seconds | since_last: 8.07 seconds :: 


' :: "02_stg".stg_unified_works_metadata_02_joined_to_openalex queries finished | since_start: 18.42 seconds | since_last: 8.07 seconds :: '

In [22]:
# Rebuild the "filtered and tagged" staging table and sanity-check it.
table_name = 'stg_unified_works_metadata_03_filtered_and_tagged'
db_name = '02_stg'
qualified_name = f'"{db_name}".{table_name}'  # quoted database + table, reused in logs and SQL
timelogger = utils.TimeLogger()

# An existing table is replaced (overwrite_strategy options: fail, overwrite, ignore).
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite',
    wait=True,
)
timelogger.log(f'{qualified_name} created')

# Preview three rows first, then report the total row count.
utils.pd_set_options(cols=300)
for check_sql in (
    f"""SELECT * FROM {qualified_name} LIMIT 3 """,
    f"""SELECT COUNT(*) AS c FROM {qualified_name} """,
):
    display(wr.athena.read_sql_query(check_sql, db_name))
timelogger.log(f'{qualified_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_unified_works_metadata_03_filtered_and_tagged already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_unified_works_metadata_03_filtered_and_tagged
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_metadata_03_filtered_and_tagged/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_metadata_03_filtered_and_tagged created | since_start: 6.76 seconds | since_last: 6.76 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on,bucket_10p,subset
0,218571297,3023155397.0,,2005.04094,2020,2020-05-05,CCBY,1,https://arxiv.org/abs/2005.04094,1,0,1,3023155397,10.48550/arxiv.2005.04094,en,10054,Parallel Computing and Optimization Techniques,1708,Hardware and Architecture,17,Computer Science,3,Physical Sciences,mag,1,test
1,208020257,3101804962.0,10.1007/s10766-019-00646-x,1911.08779,2019,2019-11-15,CCBY,1,https://arxiv.org/abs/1911.08779,1,1,1,3101804962,10.1007/s10766-019-00646-x,en,10054,Parallel Computing and Optimization Techniques,1708,Hardware and Architecture,17,Computer Science,3,Physical Sciences,doi,1,test
2,259736562,,10.1080/19942060.2023.2210196,,2023,2023-06-28,CCBY,1,https://doi.org/10.1080/19942060.2023.2210196,0,1,1,4382395626,10.1080/19942060.2023.2210196,en,10054,Parallel Computing and Optimization Techniques,1708,Hardware and Architecture,17,Computer Science,3,Physical Sciences,doi,1,test


Unnamed: 0,c
0,396430


 :: "02_stg".stg_unified_works_metadata_03_filtered_and_tagged queries finished | since_start: 14.27 seconds | since_last: 7.51 seconds :: 


' :: "02_stg".stg_unified_works_metadata_03_filtered_and_tagged queries finished | since_start: 14.27 seconds | since_last: 7.51 seconds :: '

In [2]:
# Rebuild "02_stg".stg_topics and show its most frequent topics.
# NOTE(review): the recorded run of this cell returned COUNT(*) = 1 (a single
# topic row) — confirm the SQL file is not filtering more than intended.
db_name = '02_stg'
table_name = 'stg_topics'
timelogger = utils.TimeLogger()

create_kwargs = dict(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite',  # options: fail, overwrite, ignore
    wait=True,
)
utils.create_table_from_sql_file(**create_kwargs)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
# Top 10 rows by topic count, followed by the overall row count.
top_topics_sql = f"""SELECT * FROM "{db_name}".{table_name} ORDER BY openalex_primary_topic_count DESC LIMIT 10 """
row_count_sql = f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """
display(wr.athena.read_sql_query(top_topics_sql, db_name))
display(wr.athena.read_sql_query(row_count_sql, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_topics already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_topics
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_topics/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_topics created | since_start: 6.23 seconds | since_last: 6.23 seconds :: 


Unnamed: 0,openalex_primary_topic_index,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_count,openalex_primary_topic_percent,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name
0,6,10036,Advanced Neural Network Applications,6193,1.562193,1707,Computer Vision and Pattern Recognition


Unnamed: 0,c
0,1


 :: "02_stg".stg_topics queries finished | since_start: 13.85 seconds | since_last: 7.62 seconds :: 


' :: "02_stg".stg_topics queries finished | since_start: 13.85 seconds | since_last: 7.62 seconds :: '

In [3]:
# Rebuild "02_stg".stg_subfields and list subfields by descending count.
# NOTE(review): the recorded run of this cell returned COUNT(*) = 1 (a single
# subfield row) — confirm that is the intended content of this table.
db_name = '02_stg'
table_name = 'stg_subfields'
timelogger = utils.TimeLogger()

create_kwargs = dict(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite',  # options: fail, overwrite, ignore
    wait=True,
)
utils.create_table_from_sql_file(**create_kwargs)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
# Up to 300 rows ordered by subfield count, followed by the overall row count.
subfields_sql = f"""SELECT * FROM "{db_name}".{table_name} ORDER BY openalex_primary_topic_subfield_count DESC LIMIT 300 """
row_count_sql = f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """
display(wr.athena.read_sql_query(subfields_sql, db_name))
display(wr.athena.read_sql_query(row_count_sql, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_subfields already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_subfields
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_subfields/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_subfields created | since_start: 5.89 seconds | since_last: 5.89 seconds :: 


Unnamed: 0,openalex_primary_topic_subfield_index,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_subfield_count,openalex_primary_topic_subfield_percent
0,0,1707,Computer Vision and Pattern Recognition,6193,100.0


Unnamed: 0,c
0,1


 :: "02_stg".stg_subfields queries finished | since_start: 15.96 seconds | since_last: 10.07 seconds :: 


' :: "02_stg".stg_subfields queries finished | since_start: 15.96 seconds | since_last: 10.07 seconds :: '

In [4]:
# Rebuild the works table enriched with topic/subfield columns and sanity-check it.
table_name = 'stg_unified_works_metadata_04_with_topics_and_subfields'
db_name = '02_stg'
qualified_name = f'"{db_name}".{table_name}'  # quoted database + table, reused in logs and SQL
timelogger = utils.TimeLogger()

# An existing table is replaced (overwrite_strategy options: fail, overwrite, ignore).
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite',
    wait=True,
)
timelogger.log(f'{qualified_name} created')

utils.pd_set_options(cols=300)
# The preview uses ORDER BY RANDOM(), so the three rows shown differ between runs.
for check_sql in (
    f"""SELECT * FROM {qualified_name} ORDER BY RANDOM() LIMIT 3 """,
    f"""SELECT COUNT(*) AS c FROM {qualified_name} """,
):
    display(wr.athena.read_sql_query(check_sql, db_name))
timelogger.log(f'{qualified_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_unified_works_metadata_04_with_topics_and_subfields already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_unified_works_metadata_04_with_topics_and_subfields
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_metadata_04_with_topics_and_subfields/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_metadata_04_with_topics_and_subfields created | since_start: 8.12 seconds | since_last: 8.12 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_count,openalex_primary_topic_index,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_subfield_count,openalex_primary_topic_subfield_index,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on,subset
0,257913122,,10.1109/cvpr52729.2023.00654,2304.00779,2023,2023-04-03,CCBY,1,https://arxiv.org/abs/2304.00779,0,1,1,4386065433,10.1109/cvpr52729.2023.00654,en,11714,Multimodal Machine Learning Applications,,,1707,Computer Vision and Pattern Recognition,6193.0,0.0,17,Computer Science,3,Physical Sciences,doi,train
1,269146337,,10.1038/s41598-024-59218-w,,2024,2024-04-14,CCBY,1,https://pmc.ncbi.nlm.nih.gov/articles/PMC11385191,0,1,1,4394794241,10.1038/s41598-024-59218-w,en,10812,Human Pose and Action Recognition,,,1707,Computer Vision and Pattern Recognition,6193.0,0.0,17,Computer Science,3,Physical Sciences,doi,train
2,104293068,2762581681.0,10.1080/03610918.2017.1390126,,2019,2019-02-07,CCBY,1,https://doi.org/10.1080/03610918.2017.1390126,1,1,1,2762581681,10.1080/03610918.2017.1390126,en,13748,Advanced Statistical Modeling Techniques,,,1705,Computer Networks and Communications,,,17,Computer Science,3,Physical Sciences,doi,train


Unnamed: 0,c
0,396430


 :: "02_stg".stg_unified_works_metadata_04_with_topics_and_subfields queries finished | since_start: 17.05 seconds | since_last: 8.93 seconds :: 


' :: "02_stg".stg_unified_works_metadata_04_with_topics_and_subfields queries finished | since_start: 17.05 seconds | since_last: 8.93 seconds :: '

In [5]:
# Build "02_stg".stg_unified_works_filtered via utils.create_table_from_sql_file,
# then preview it and count its rows.
table_name = 'stg_unified_works_filtered'
db_name = '02_stg'  # named once here instead of repeating the literal in every call below
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

# Sanity checks: show three rows, then the total row count.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_unified_works_filtered already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_unified_works_filtered
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_filtered/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_filtered created | since_start: 1.0 minute, 29.43 seconds | since_last: 1.0 minute, 29.43 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_count,openalex_primary_topic_index,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_subfield_count,openalex_primary_topic_subfield_index,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on,title,content_abstract,content_text,annotations_paragraph,annotations_section_header,subset
0,234991791,3116647484.0,10.1051/e3sconf/202022401006,,2020,,CCBY,1,https://doi.org/10.1051/e3sconf/202022401006,1,1,1,3116647484,10.1051/e3sconf/202022401006,en,13935,Mathematical Control Systems and Analysis,,,1702,Artificial Intelligence,,,17,Computer Science,3,Physical Sciences,doi,Modal synthesis of precision control systems,The problem of synthesis of precision modal control systems is considered. It is noted that a common approach to solving this problem is to consistently meet the requirements for the nature of the transient process and for the indicators of its accuracy. This approach to synthesis is faced with ...,"\nIntroduction\n\nThe principle of modal control, due to the most complete reflection of the cybernetic essence of state feedback, is widely used to control lightly damped, structurally unstable and nonstationary objects [1][2][3][4][5][6][7][8][9][10]. \n\nThe problem of forming the spectrum of...","[{""attributes"":null,""end"":251,""start"":15},{""attributes"":null,""end"":721,""start"":253},{""attributes"":null,""end"":1466,""start"":723},{""attributes"":null,""end"":1862,""start"":1468},{""attributes"":null,""end"":1923,""start"":1864},{""attributes"":null,""end"":2025,""start"":1943},{""attributes"":null,""end"":2189,""start""...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{""attributes"":{""n"":""2.1.""},""end"":1941,""start"":1925},{""attributes"":{""n"":""2.2.""},""end"":5319,""start"":5274},{""attributes"":{""n"":""3.1.""},""end"":6871,""start"":6855},{""attributes"":{""n"":""3.2.""},""end"":12033,""start"":11988},{""attributes"":null,""end"":13631,""start"":13...",train
1,265408457,,10.3390/axioms12121074,,2023,2023-11-23,CCBY,1,https://doi.org/10.3390/axioms12121074,0,1,1,4388939305,10.3390/axioms12121074,en,13983,Cybersecurity and Information Systems,,,1705,Computer Networks and Communications,,,17,Computer Science,3,Physical Sciences,doi,Optimizing Energy Conversion in a Piezo Disk Using a Controlled Supply of Electrical Load,"Piezoceramic products are actively used in modern technical devices and appliances. Disk piezoelectric devices are widely used in elements of information systems: in wireless communication, elements of satellite communication, global positioning systems. Among such devices, microwave piezo motor...","\nIntroduction\n\nDifferent methods and approaches are often used when setting up and solving applied natural or socio-economic problems. Many of these methods are based on a pre-defined set of axioms and laws. Based on this set, a mathematical model of the applied problem and methods of its sol...","[{""attributes"":null,""end"":905,""start"":15},{""attributes"":null,""end"":1595,""start"":907},{""attributes"":null,""end"":1858,""start"":1597},{""attributes"":null,""end"":2149,""start"":1860},{""attributes"":null,""end"":2466,""start"":2151},{""attributes"":null,""end"":2971,""start"":2468},{""attributes"":null,""end"":3541,""star...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":9368,""start"":9304},{""attributes"":{""n"":""3.""},""end"":15236,""start"":15179},{""attributes"":{""n"":""4.""},""end"":19502,""start"":19471}]",test
2,226211159,3011044277.0,10.20998/2522-9052.2020.1.10,,2020,2020-03-14,CCBYNC,1,https://doi.org/10.20998/2522-9052.2020.1.10,1,1,1,3011044277,10.20998/2522-9052.2020.1.10,en,13983,Cybersecurity and Information Systems,,,1705,Computer Networks and Communications,,,17,Computer Science,3,Physical Sciences,doi,IMPLEMENTATION OF THE ARITHMETIC ADDITION OPERATION IN THE SYSTEM OF RESIDUAL CLASSES,The subject of the article is the development of a method for implementing the arithmetic operation of addition numbers that are represented in the system of residual classes (SRC). This method is based on the use of the principle of circular shift (PCS). The purpose of the article is to reduce ...,"\nIntroduction\n\nIn positional binary number system (PBNS), the execution of the arithmetical addition operation presumes the sequential processing of digit positions of numbers by the rules determined by the content of this operation [1][2][3]. The processing continues until the values of all ...","[{""attributes"":null,""end"":1144,""start"":15},{""attributes"":null,""end"":1641,""start"":1165},{""attributes"":null,""end"":1690,""start"":1643},{""attributes"":null,""end"":1812,""start"":1692},{""attributes"":null,""end"":1964,""start"":1814},{""attributes"":null,""end"":2029,""start"":1966},{""attributes"":null,""end"":2179,""st...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":1163,""start"":1146},{""attributes"":null,""end"":2263,""start"":2181},{""attributes"":null,""end"":4778,""start"":4709},{""attributes"":null,""end"":6523,""start"":6512},{""attributes"":null,""end"":8459,""start"":8389},{""attributes"":null,""end"":10604,""star...",validation


Unnamed: 0,c
0,396430


 :: "02_stg".stg_unified_works_filtered queries finished | since_start: 1.0 minute, 39.32 seconds | since_last: 9.89 seconds :: 


' :: "02_stg".stg_unified_works_filtered queries finished | since_start: 1.0 minute, 39.32 seconds | since_last: 9.89 seconds :: '

In [6]:
# Rebuild "03_core".unified_works and sanity-check it.
db_name = '03_core'
table_name = 'unified_works'
timelogger = utils.TimeLogger()

# Unlike the staging cells, this one passes config.S3_CORE_DATA_PATH as the
# S3 parent target path.
create_kwargs = dict(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite',  # options: fail, overwrite, ignore
    wait=True,
    s3_parent_target_path=config.S3_CORE_DATA_PATH,
)
utils.create_table_from_sql_file(**create_kwargs)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=100)
# Preview three rows, then report the total row count.
preview_sql = f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """
row_count_sql = f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """
display(wr.athena.read_sql_query(preview_sql, db_name))
display(wr.athena.read_sql_query(row_count_sql, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 03_core.unified_works already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core unified_works
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".unified_works created | since_start: 39.92 seconds | since_last: 39.92 seconds :: 


Unnamed: 0,id,topic_index,topic_display_name,subfield_index,subfield_display_name,title,abstract,fulltext,subset
0,265115895,,Media and Digital Communication,,Computer Networks and Communications,E-Commerce as a Source of Revenue in Spanish Digital News Media,This study analyzes e-commerce strategies in Spanish active digital news outlets comprehensively...,\nIntroduction\n\nThe digital era has ushered in significant challenges to the traditional reven...,train
1,56392175,,Educational Research and Pedagogy,,Computer Networks and Communications,A Study on Textbook Evaluation Criteria for the Teaching of Culture in English Language Teaching,Textbooks play a significant role in language teaching and learning. If language teaching is clo...,\nIntroduction\n\nThis paper addresses the issue of culture in English-language teaching and in ...,validation
2,218505818,,Computational Drug Discovery Methods,,Computational Theory and Mathematics,Repurposing strategies on pyridazinone-based series by pharmacophore- and structure-driven scree...,Abstract We report here in silico repurposing studies on 52 new pyridazinone-based small-molecul...,"\nChemistry\n\nAnalogues 1a,b and 2a,b were prepared from the 4,5-dichloro-3(2H)-pyridazinone 22...",train


Unnamed: 0,c
0,396430


 :: "03_core".unified_works queries finished | since_start: 49.79 seconds | since_last: 9.86 seconds :: 


' :: "03_core".unified_works queries finished | since_start: 49.79 seconds | since_last: 9.86 seconds :: '

In [7]:
# Rebuild the train/test/validation tables in "03_core" and sanity-check each one.
timelogger = utils.TimeLogger()
db_name = '03_core'
table_names = ['unified_works_train', 'unified_works_test', 'unified_works_validation']

# Loop-invariant, so it is set once up front instead of once per table.
utils.pd_set_options(cols=100)

for table_name in table_names:
    utils.create_table_from_sql_file(
        database_name = db_name,
        table_name = table_name,
        overwrite_strategy='overwrite', # options: fail, overwrite, ignore
        wait=True,
        s3_parent_target_path=config.S3_CORE_DATA_PATH
    )
    timelogger.log(f'"{db_name}".{table_name} created')

    # Preview three rows, then report the total row count for this table.
    display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
    display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
    timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 03_core.unified_works_train already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core unified_works_train
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works_train/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".unified_works_train created | since_start: 33.93 seconds | since_last: 33.93 seconds :: 


Unnamed: 0,id,topic_index,topic_display_name,subfield_index,subfield_display_name,title,abstract,fulltext,subset
0,226848068,,AI in cancer detection,,Artificial Intelligence,Breast Mass Classification Using eLFA Algorithm Based on CRNN Deep Learning Model,Breast cancer is known to be common in many developed countries. It is reported as the most comm...,\nI. INTRODUCTION\n\nCancer is one of the most dangerous diseases affecting humankind. It indica...,train
1,270038919,,AI in cancer detection,,Artificial Intelligence,Practical Application of Deep Learning in Diagnostic Neuropathology—Reimagining a Histological A...,"Simple Summary Technological and scientific innovations, from genetic sequencing to digital path...",\nIntroduction\n\nRecent years in the field of diagnostic neuropathology have shown an increasin...,train
2,264453776,,AI in cancer detection,,Artificial Intelligence,Lightweight Histological Tumor Classification Using a Joint Sparsity-Quantization Aware Training...,Cancer decision-making is a complex process that can be exacerbated by the limited availability ...,\nI. INTRODUCTION\n\nThe widespread use of deep learning models has many implications across mul...,train


Unnamed: 0,c
0,316885


 :: "03_core".unified_works_train queries finished | since_start: 43.01 seconds | since_last: 9.08 seconds :: 
Table 03_core.unified_works_test already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core unified_works_test
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works_test/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".unified_works_test created | since_start: 52.61 seconds | since_last: 9.60 seconds :: 


Unnamed: 0,id,topic_index,topic_display_name,subfield_index,subfield_display_name,title,abstract,fulltext,subset
0,225062121,,Speech Recognition and Synthesis,,Artificial Intelligence,Rediscovering the Slavic Continuum in Representations Emerging from Neural Models of Spoken Lang...,"Deep neural networks have been employed for various spoken language recognition tasks, including...",\nIntroduction\n\nThe relationship between a group of human languages can be characterized acros...,test
1,273549551,,Speech Recognition and Synthesis,,Artificial Intelligence,STTATTS: Unified Speech-To-Text And Text-To-Speech Model,"Speech recognition and speech synthesis models are typically trained separately, each with its o...","\nIntroduction\n\nFundamentally, text and speech are different representations of similar inform...",test
2,268691349,,Speech Recognition and Synthesis,,Artificial Intelligence,LISA: Layerwise Importance Sampling for Memory-Efficient Large Language Model Fine-Tuning,The machine learning community has witnessed impressive advancements since large language models...,\nIntroduction\n\nLarge language models (LLMs) like ChatGPT excel in tasks such as writing docum...,test


Unnamed: 0,c
0,39792


 :: "03_core".unified_works_test queries finished | since_start: 1.0 minute, 1.45 seconds | since_last: 8.84 seconds :: 
Table 03_core.unified_works_validation already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core unified_works_validation
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works_validation/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".unified_works_validation created | since_start: 1.0 minute, 10.94 seconds | since_last: 9.49 seconds :: 


Unnamed: 0,id,topic_index,topic_display_name,subfield_index,subfield_display_name,title,abstract,fulltext,subset
0,268480549,,Hate Speech and Cyberbullying Detection,,Artificial Intelligence,Enhancing Automated Hate Speech Detection: Addressing Islamophobia and Freedom of Speech in Onli...,This paper emphasizes the necessity of a precise definition of Islamophobia within the realm of ...,"\nI. INTRODUCTION\n\nIslamophobia, which encompasses fear, prejudice, or discrimination against ...",validation
1,236486143,,Hate Speech and Cyberbullying Detection,,Artificial Intelligence,Fine-grained Classification of Political Bias in German News: A Data Set and Initial Experiments,We present a data set consisting of German news articles labeled for political bias on a five-po...,\nIntroduction\n\nThe social web and social media networks have received an ever-increasing amou...,validation
2,254955993,,Interconnection Networks and Systems,,Computer Networks and Communications,Proficient matrix codes for error detection and correctionin 8-port network on chip routers,This paper verifies the applicability of the proposed code to dynamic Network on Chips that have...,"\nINTRODUCTION\n\nAs a result of rapid advancements in very large-scale integration (VLSI), bill...",validation


Unnamed: 0,c
0,39753


 :: "03_core".unified_works_validation queries finished | since_start: 1.0 minute, 18.72 seconds | since_last: 7.78 seconds :: 


In [8]:
# Report, for each split table, the row count per `subset` value (c) and that
# count as a percentage of all rows in "03_core".unified_works (p).
# db_name is defined here so the cell does not depend on whichever cell last
# assigned it.
db_name = '03_core'
split_table_names = ['unified_works_train', 'unified_works_test', 'unified_works_validation']

# One identical aggregate per split table, glued together with UNION ALL.
split_size_sql = '\n    UNION ALL\n'.join(
    f"""
    SELECT 
        subset,
        COUNT(*) AS c,
        COUNT(*) * 100.0 / (SELECT COUNT(*) FROM "{db_name}".unified_works) AS p
    FROM 
        "{db_name}".{split_table_name}
    GROUP BY
        subset
    """
    for split_table_name in split_table_names
)
display(wr.athena.read_sql_query(split_size_sql, db_name))

Unnamed: 0,subset,c,p
0,validation,39753,10.027748
1,test,39792,10.037585
2,train,316885,79.934667


In [9]:
# Rebuild "03_core".topics and sanity-check it.
table_name = 'topics'
db_name = '03_core'
qualified_name = f'"{db_name}".{table_name}'  # quoted database + table, reused in logs and SQL
timelogger = utils.TimeLogger()

# An existing table is replaced (overwrite_strategy options: fail, overwrite, ignore);
# config.S3_CORE_DATA_PATH is passed as the S3 parent target path.
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite',
    wait=True,
    s3_parent_target_path=config.S3_CORE_DATA_PATH,
)
timelogger.log(f'{qualified_name} created')

# Preview up to 30 rows, then report the total row count.
utils.pd_set_options(cols=300)
for check_sql in (
    f"""SELECT * FROM {qualified_name} LIMIT 30 """,
    f"""SELECT COUNT(*) AS c FROM {qualified_name} """,
):
    display(wr.athena.read_sql_query(check_sql, db_name))
timelogger.log(f'{qualified_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 03_core.topics already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core topics
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/topics/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".topics created | since_start: 3.59 seconds | since_last: 3.59 seconds :: 


Unnamed: 0,topic_index,topic_original_id,topic_display_name,topic_count,subfield_original_id,subfield_display_name
0,6,10036,Advanced Neural Network Applications,6193,1707,Computer Vision and Pattern Recognition


Unnamed: 0,c
0,1


 :: "03_core".topics queries finished | since_start: 11.19 seconds | since_last: 7.60 seconds :: 


' :: "03_core".topics queries finished | since_start: 11.19 seconds | since_last: 7.60 seconds :: '

In [10]:
# (Re)create "03_core".subfields from its SQL file, then preview the result.
table_name = 'subfields'
db_name = '03_core'
timelogger = utils.TimeLogger()

# overwrite_strategy options: fail, overwrite, ignore.
# With 'overwrite' the run log shows the existing Glue table and its S3
# objects being deleted before the table is rebuilt.
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite',
    wait=True,
    s3_parent_target_path=config.S3_CORE_DATA_PATH,
)
timelogger.log(f'"{db_name}".{table_name} created')

# Sanity checks: a few sample rows and the total row count.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
# log() prints the message and also returns it; binding the return value to `_`
# stops the notebook from echoing the same line again as the cell's output.
_ = timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 03_core.subfields already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core subfields
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/subfields/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".subfields created | since_start: 2.99 seconds | since_last: 2.99 seconds :: 


Unnamed: 0,subfield_index,subfield_original_id,subfield_display_name,subfield_count
0,0,1707,Computer Vision and Pattern Recognition,6193


Unnamed: 0,c
0,1


 :: "03_core".subfields queries finished | since_start: 10.82 seconds | since_last: 7.84 seconds :: 


' :: "03_core".subfields queries finished | since_start: 10.82 seconds | since_last: 7.84 seconds :: '

In [11]:
utils.pd_set_options(cols=500)

# Exploratory probe of the 10-way bucket assignment: English works whose
# license allows derivative reuse are split into NTILE(10) buckets per
# primary topic, and the buckets for topic 13932 are listed.
# The ordering inside each partition is random(), so bucket numbers change
# from run to run.
# NOTE(review): the query only reads "02_stg" tables but is submitted against
# database '01_raw' - confirm that this is intended.
wr.athena.read_sql_query("""
WITH
stg_unified_works_metadata_02_joined_to_openalex_ AS (
    SELECT * FROM "02_stg".stg_unified_works_metadata_02_joined_to_openalex
),
metadata_filtered AS (
    SELECT 
        * 
    FROM
        stg_unified_works_metadata_02_joined_to_openalex_
    WHERE
        openalex_language='en' AND
        license_allows_derivative_reuse=1
),
numbered AS (
    SELECT
        *,
        NTILE(10) OVER( PARTITION BY openalex_primary_topic_id ORDER BY random()) AS bucket_10p
    FROM
        metadata_filtered
)
SELECT id_semanticscholar, bucket_10p FROM numbered WHERE openalex_primary_topic_id=13932 ORDER BY id_semanticscholar
 """, '01_raw')

Unnamed: 0,id_semanticscholar,bucket_10p
0,18991472,1
1,54460202,2
2,54460599,3
3,54462925,1
4,54574519,4
5,145582524,8
6,159229472,10
7,167039776,6
8,212995807,7
9,216341058,5


In [4]:
# (Re)create "02_stg".stg_filtered_work_section_annotations from its SQL file,
# then preview the result.
table_name = 'stg_filtered_work_section_annotations'
db_name = '02_stg'
timelogger = utils.TimeLogger()

# overwrite_strategy options: fail, overwrite, ignore.
# s3_parent_target_path is deliberately not passed; the run log shows the
# helper then resolving it to the 01_data/02_stg prefix on its own.
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite',
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

# Sanity checks: a few sample rows and the total row count.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
# log() prints the message and also returns it; binding the return value to `_`
# stops the notebook from echoing the same line again as the cell's output.
_ = timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_filtered_work_section_annotations/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_filtered_work_section_annotations created | since_start: 41.60 seconds | since_last: 41.60 seconds :: 


Unnamed: 0,work_id,section_index,section_type_index,n,section_start,section_end,section_type
0,842983,19,6,,15340,15379,header
1,842983,20,14,,15381,16284,paragraph
2,842983,21,15,,16286,16380,paragraph


Unnamed: 0,c
0,35255622


 :: "02_stg".stg_filtered_work_section_annotations queries finished | since_start: 48.75 seconds | since_last: 7.15 seconds :: 


' :: "02_stg".stg_filtered_work_section_annotations queries finished | since_start: 48.75 seconds | since_last: 7.15 seconds :: '

In [11]:
# (Re)create "02_stg".stg_filtered_work_sections from its SQL file, then
# preview the result.
table_name = 'stg_filtered_work_sections'
db_name = '02_stg'
timelogger = utils.TimeLogger()

# overwrite_strategy options: fail, overwrite, ignore.
# s3_parent_target_path is deliberately not passed; the run log shows the
# helper then resolving it to the 01_data/02_stg prefix on its own.
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite',
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

# Sanity checks: a few sample rows and the total row count.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
# log() prints the message and also returns it; binding the return value to `_`
# stops the notebook from echoing the same line again as the cell's output.
_ = timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_filtered_work_sections already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_filtered_work_sections
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_filtered_work_sections/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_filtered_work_sections created | since_start: 44.64 seconds | since_last: 44.64 seconds :: 


Unnamed: 0,work_id,section_index,section_type_index,n,section_start,section_end,section_type,section_text,section_text_length
0,213004410,26,23,,12542,13340,paragraph,", at the design points, X = {x 1 , . . . , x n }, can be estimated. We take an empirical Bayes (EB) approach to estimation as a compromise between (i) computational cost and (ii) comprehensive uncertainty quantification. A fully Bayesian approach, via MCMC, allows us to quantify and propagate un...",798
1,213004410,27,24,,13342,13635,paragraph,"We assign priors β ∼ N (b, B) and β V ∼ N (b V , B V ), marginalise out the β coefficients and obtain a MAP estimate of the GP covariance structure. After integrating out β V we can write the joint density of log λ 2 (X) and log λ 2 (X ), where X and X are collections of simulator inputs, as",293
2,213004410,28,25,,13637,13717,paragraph,where Θ −β denotes the vector Θ with β and β V removed. We have also introduced,80


Unnamed: 0,c
0,35334742


 :: "02_stg".stg_filtered_work_sections queries finished | since_start: 53.04 seconds | since_last: 8.40 seconds :: 


' :: "02_stg".stg_filtered_work_sections queries finished | since_start: 53.04 seconds | since_last: 8.40 seconds :: '

In [12]:
# (Re)create "02_stg".stg_unified_works_filtered_with_section_stats from its
# SQL file, then preview the result.
table_name = 'stg_unified_works_filtered_with_section_stats'
db_name = '02_stg'
timelogger = utils.TimeLogger()

# overwrite_strategy options: fail, overwrite, ignore.
# s3_parent_target_path is deliberately not passed; the run log shows the
# helper then resolving it to the 01_data/02_stg prefix on its own.
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite',
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

# Sanity checks: a few sample rows and the total row count.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
# log() prints the message and also returns it; binding the return value to `_`
# stops the notebook from echoing the same line again as the cell's output.
_ = timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_unified_works_filtered_with_section_stats already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_unified_works_filtered_with_section_stats
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_filtered_with_section_stats/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_filtered_with_section_stats created | since_start: 33.18 seconds | since_last: 33.18 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_count,openalex_primary_topic_index,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_subfield_count,openalex_primary_topic_subfield_index,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on,title,content_abstract,content_text,annotations_paragraph,annotations_section_header,subset,number_of_sections,number_of_headers,number_of_paragraphs,has_sections,has_headers,has_paragraphs
0,234744282,3148682774,10.33851/jmis.2021.8.1.75,,2021,2021-03-31,CCBYNC,1,https://doi.org/10.33851/JMIS.2021.8.1.75,1,1,1,3148682774,10.33851/jmis.2021.8.1.75,en,12799,Mobile and Web Applications,,,1710,Information Systems,,,17,Computer Science,3,Physical Sciences,doi,Improving Student's Design Prototyping Skills using Interactive Prototyping Tool,,"\nI. INTRODUCTION\n\nHuman-Computer Interaction (HCI) is a field that repeats the process of designing, prototyping, and evaluating a product's user interface (UI). An important aspect of HCI is user satisfaction. For this reason, HCI is an interdisciplinary discipline that combines computer sci...","[{""attributes"":null,""end"":1004,""start"":18},{""attributes"":null,""end"":2112,""start"":1006},{""attributes"":null,""end"":2328,""start"":2114},{""attributes"":null,""end"":2425,""start"":2344},{""attributes"":null,""end"":3762,""start"":2476},{""attributes"":null,""end"":4855,""start"":3764},{""attributes"":null,""end"":5446,""st...","[{""attributes"":null,""end"":16,""start"":1},{""attributes"":null,""end"":2342,""start"":2330},{""attributes"":null,""end"":2474,""start"":2427},{""attributes"":null,""end"":6814,""start"":6750},{""attributes"":null,""end"":13286,""start"":13272}]",train,28,5,23,1,1,1
1,158451629,2792193016,10.3390/land7010039,,2018,2018-03-20,CCBY,1,https://doi.org/10.3390/LAND7010039,1,1,1,2792193016,10.3390/land7010039,en,12805,Cognitive Science and Mapping,,,1702,Artificial Intelligence,,,17,Computer Science,3,Physical Sciences,doi,"System properties determine food security and biodiversity outcomes at landscape scale: a case study from West Flores, Indonesia",The food-biodiversity nexus is a concept that defines and characterizes the complex interactions between agricultural systems and biodiversity conservation. Here we use a social-ecological systems approach that combines fuzzy cognitive mapping and graph theoretic analyses to uncover system prope...,"\nIntroduction\n\nAchieving food security for all people while conserving biodiversity and ecosystem services are two, strongly linked, challenges fundamental to securing global sustainability. Global leadership has committed to achieving food security and conserving biodiversity, for example, b...","[{""attributes"":null,""end"":633,""start"":15},{""attributes"":null,""end"":1952,""start"":635},{""attributes"":null,""end"":2672,""start"":1954},{""attributes"":null,""end"":4583,""start"":2674},{""attributes"":null,""end"":5565,""start"":4585},{""attributes"":null,""end"":6223,""start"":5567},{""attributes"":null,""end"":7916,""star...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":10413,""start"":10392},{""attributes"":{""n"":""2.1.""},""end"":10428,""start"":10415},{""attributes"":{""n"":""2.""},""end"":11629,""start"":11608},{""attributes"":{""n"":""2.1.""},""end"":11644,""start"":11631},{""attributes"":{""n"":""2.2.""},""end"":14630...",test,103,30,73,1,1,1
2,218820716,3017117866,10.1002/acs.3115,,2020,2020-04-16,CCBY,1,https://doi.org/10.1002/acs.3115,1,1,1,3017117866,10.1002/acs.3115,en,12794,Adaptive Dynamic Programming Control,,,1703,Computational Theory and Mathematics,,,17,Computer Science,3,Physical Sciences,doi,Online optimal and adaptive integral tracking control for varying discrete‐time systems using reinforcement learning,"Conventional closed‐form solution to the optimal control problem using optimal control theory is only available under the assumption that there are known system dynamics/models described as differential equations. Without such models, reinforcement learning (RL) as a candidate technique has been...","\nINTRODUCTION\n\nReinforcement learning (RL) is a type of machine learning technique that has been used extensively in the area of computing and artificial intelligence to solve complex optimization problems. 1,2 Due to its successes, there have been concerted efforts by researchers in the cont...","[{""attributes"":null,""end"":5115,""start"":15},{""attributes"":null,""end"":6477,""start"":5117},{""attributes"":null,""end"":7058,""start"":6479},{""attributes"":null,""end"":7168,""start"":7081},{""attributes"":null,""end"":7339,""start"":7170},{""attributes"":null,""end"":7347,""start"":7341},{""attributes"":null,""end"":7485,""st...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{""attributes"":{""n"":""2""},""end"":7079,""start"":7060},{""attributes"":null,""end"":9100,""start"":9093},{""attributes"":null,""end"":10168,""start"":10161},{""attributes"":{""n"":""3""},""end"":11416,""start"":11340},{""attributes"":{""n"":""4""},""end"":12874,""start"":12799},{""attribut...",train,155,23,132,1,1,1


Unnamed: 0,c
0,396430


 :: "02_stg".stg_unified_works_filtered_with_section_stats queries finished | since_start: 41.23 seconds | since_last: 8.05 seconds :: 


' :: "02_stg".stg_unified_works_filtered_with_section_stats queries finished | since_start: 41.23 seconds | since_last: 8.05 seconds :: '

In [13]:
# (Re)create "02_stg".stg_filtered_work_sections_with_headers from its SQL
# file, then preview the result.
table_name = 'stg_filtered_work_sections_with_headers'
db_name = '02_stg'
timelogger = utils.TimeLogger()

# overwrite_strategy options: fail, overwrite, ignore.
# s3_parent_target_path is deliberately not passed; the run log shows the
# helper then resolving it to the 01_data/02_stg prefix on its own.
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite',
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

# Sanity checks: a few sample rows and the total row count.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
# log() prints the message and also returns it; binding the return value to `_`
# stops the notebook from echoing the same line again as the cell's output.
_ = timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_filtered_work_sections_with_headers already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_filtered_work_sections_with_headers
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_filtered_work_sections_with_headers/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_filtered_work_sections_with_headers created | since_start: 34.27 seconds | since_last: 34.27 seconds :: 


Unnamed: 0,work_id,section_index,section_type_index,n,section_start,section_end,section_type,section_text,section_text_length,work_has_sections,work_has_headers,work_has_paragraphs,work_number_of_sections,work_number_of_headers,work_number_of_paragraphs
0,263608554,1,1,,1,16,header,I. INTRODUCTION,15,1,1,1,125,5,120
1,263608554,2,1,,18,656,paragraph,"V Irtual constraints are relations on the configuration vari- ables of a control system which are imposed through feedback control and the action of actuators, instead of through physical connections such as gears or contact conditions with the environment. The advantage of working with virtual ...",638,1,1,1,125,5,120
2,263608554,3,2,,658,1546,paragraph,"Virtual holonomic constraints have been studied over the past few years in a variety of contexts, such as motion planning and control [11], [24], [18], [29] and biped locomotion where it was used to achieve a desired walking gait [8], [28]. Virtual nonholonomic constraints are a class of virtual...",888,1,1,1,125,5,120


Unnamed: 0,c
0,35450809


 :: "02_stg".stg_filtered_work_sections_with_headers queries finished | since_start: 43.44 seconds | since_last: 9.17 seconds :: 


' :: "02_stg".stg_filtered_work_sections_with_headers queries finished | since_start: 43.44 seconds | since_last: 9.17 seconds :: '

In [15]:
# (Re)create "02_stg".stg_filtered_work_chapters from its SQL file, then
# preview the result.
table_name = 'stg_filtered_work_chapters'
db_name = '02_stg'
timelogger = utils.TimeLogger()

# overwrite_strategy options: fail, overwrite, ignore.
# s3_parent_target_path is deliberately not passed; the run log shows the
# helper then resolving it to the 01_data/02_stg prefix on its own.
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite',
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

# Sanity checks: a few sample rows and the total row count.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
# log() prints the message and also returns it; binding the return value to `_`
# stops the notebook from echoing the same line again as the cell's output.
_ = timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_filtered_work_chapters already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_filtered_work_chapters
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_filtered_work_chapters/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_filtered_work_chapters created | since_start: 51.02 seconds | since_last: 51.02 seconds :: 


Unnamed: 0,work_id,section_indices,section_type_indices,header_index,number_of_sections_in_block,section_starts,min_section_start,section_ends,max_section_end,header_text,header_text_length,block_text,block_text_length
0,1979,"[202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226]","[31, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195]",31,25,"[72254, 72267, 73315, 74244, 74427, 74530, 74864, 75620, 75811, 76627, 76712, 77028, 77108, 77399, 77433, 77684, 77918, 78090, 78095, 78100, 78106, 78111, 78116, 78121, 78127]",72254,"[72265, 73313, 74242, 74425, 74528, 74862, 75618, 75809, 76625, 76710, 77026, 77106, 77397, 77431, 77682, 77916, 78086, 78091, 78096, 78102, 78107, 78112, 78117, 78123, 78128]",78128,Conclusions,11,"Conclusions\n\nJust as it is often desirable to have guarantees of correctness for a program, in many plausible contexts it would be highly desirable to have an automatic programming system o er some formal guarantees of correctness. The topic of this paper is the learnability of recursive logic...",5858
1,2134,"[1, 2, 3, 4]","[1, 1, 2, 3]",1,4,"[1, 15, 1330, 1568]",1,"[13, 1328, 1566, 2495]",2495,Introduction,12,"Introduction\n\nWhat does it mean to analyze or -more ambitiously -to understand a text? Over the years, Artificial Intelligence and Computational Linguistics have responded in quite different ways to this question. The present paper argues in favour of a text-technologicallyinspired multi-level...",2494
2,2134,[5],[2],2,1,[2497],2497,[2501],2501,LDV-,4,LDV-,4


Unnamed: 0,c
0,6841474


 :: "02_stg".stg_filtered_work_chapters queries finished | since_start: 1.0 minute, 0.11 seconds | since_last: 9.09 seconds :: 


' :: "02_stg".stg_filtered_work_chapters queries finished | since_start: 1.0 minute, 0.11 seconds | since_last: 9.09 seconds :: '