In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Make the shared helper modules importable by adding ../01_modules and/or
# ./01_modules (whichever exists) to the module search path.
# `__file__` is normally not defined inside a notebook kernel, so in that case
# the current working directory is used as the anchor instead.
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
for _modules_path in (modules_path_in_dev, modules_path_in_prod):
    # Only add existing directories, and only once: re-running this cell must
    # not pile up duplicate entries in sys.path.
    if os.path.exists(_modules_path) and _modules_path not in sys.path:
        sys.path.append(_modules_path)


# Jupyter only imports a local module once per kernel session: re-running
# an `import` statement for it in a cell changes nothing, even if the
# module's source file has been edited in the meantime.
# As a workaround, we import the module by name and then call
# importlib.reload on it, so that the current version of the module
# is picked up every time this cell is re-run.
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [2]:
# One-off SageMaker processing job that runs the OpenAlex works reduction script.
# Disabled by default because a run takes about a day; flip the flag to True
# only when the reduction really has to be redone.
RUN_OPENALEX_WORKS_REDUCTION = False

if RUN_OPENALEX_WORKS_REDUCTION:
    execution_role = get_execution_role()
    # The job's source directory is two levels above the current working directory.
    source_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    print('source_dir:', source_dir)
    sklearn_processor = FrameworkProcessor(
        estimator_cls=SKLearn,
        framework_version='1.2-1',  # The newest supported version by sagemaker
        instance_type='ml.c7i.16xlarge',
        instance_count=1,
        # Hyphenated form of "openalex_works_reduction"
        # (presumably because SageMaker job names may not contain underscores).
        base_job_name='openalex-works-reduction',
        role=execution_role,
    )

    # wait=True blocks this cell until the processing job has finished.
    step_args = sklearn_processor.run(
        code='src/03_transformation/03_11_transformation_openalex_works_reduction.py',
        source_dir=source_dir,
        inputs=[],  # We are not using automatic input-output mapping, instead we handle everything in the script directly on S3
        outputs=[],
        arguments=[
            '--runtype', 'prod',
            '--file-max-limit', '10000',
        ],
        wait=True,
    )

In [30]:
# (Re)create the table via utils.create_table_from_sql_file, then sanity-check
# the result with a 3-row preview and a total row count.
database_name = '02_stg'
table_name = 'base_openalex_works_reduced_filtered'
qualified_name = f'"{database_name}".{table_name}'  # quoted: the database name starts with a digit

timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite',  # options: fail, overwrite, ignore
    wait=True,
)
_ = timelogger.log(f'{qualified_name} created')

utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM {qualified_name} LIMIT 3 """, database=database_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM {qualified_name} """, database=database_name))
# log() also returns the message it prints; assigning it keeps the cell from
# echoing the same line a second time as its Out[] value.
_ = timelogger.log(f'{qualified_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_openalex_works_reduced_filtered/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_openalex_works_reduced_filtered created | since_start: 42.36 seconds | since_last: 42.36 seconds :: 


Unnamed: 0,id_openalex,id_doi,title,language,primary_topic_id,primary_topic_display_name,primary_topic_subfield_id,primary_topic_subfield_display_name,primary_topic_field_id,primary_topic_field_display_name,primary_topic_domain_id,primary_topic_domain_display_name
0,2184395120,,AN APPROACH FOR COMPRESSING DIGITAL IMAGES BY USING RUN LENGTH ENCODING,en,T10901,Advanced Data Compression Techniques,1707,Computer Vision and Pattern Recognition,17,Computer Science,3,Physical Sciences
1,2184397778,,LEARNINGMINIMUM VOLUME SETSWITH SUPPORTVECTOR MACHINES,en,T11512,Anomaly Detection Techniques and Applications,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences
2,2184397944,,Analyzing thePerformance ofVoice over Internet Protocol ina 3GNetwork,en,T10575,Wireless Communication Networks Research,1705,Computer Networks and Communications,17,Computer Science,3,Physical Sciences


Unnamed: 0,c
0,13242469


 :: "02_stg".base_openalex_works_reduced_filtered queries finished | since_start: 50.44 seconds | since_last: 8.08 seconds :: 


' :: "02_stg".base_openalex_works_reduced_filtered queries finished | since_start: 50.44 seconds | since_last: 8.08 seconds :: '

In [3]:
# (Re)create the table via utils.create_table_from_sql_file, then sanity-check
# the result with a 3-row preview and a total row count.
# NOTE: the last recorded run of the create step took about 11 minutes.
database_name = '02_stg'
table_name = 'base_semanticscholar_s2orcv2'
qualified_name = f'"{database_name}".{table_name}'  # quoted: the database name starts with a digit

timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite',  # options: fail, overwrite, ignore
    wait=True,
)
_ = timelogger.log(f'{qualified_name} created')

utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM {qualified_name} LIMIT 3 """, database=database_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM {qualified_name} """, database=database_name))
# log() also returns the message it prints; assigning it keeps the cell from
# echoing the same line a second time as its Out[] value.
_ = timelogger.log(f'{qualified_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.base_semanticscholar_s2orcv2 already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg base_semanticscholar_s2orcv2
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_semanticscholar_s2orcv2/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_semanticscholar_s2orcv2 created | since_start: 11.0 minutes, 1.71 seconds | since_last: 11.0 minutes, 1.71 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,title,source_url,openaccess_status,content_text,annotations_paragraph,annotations_section_header,license
0,1808709,2101143116,10.1186/1476-511x-13-132,,Twist 1 regulates the expression of PPARγ during hormone-induced 3T3-L1 preadipocyte differentiation: a possible role in obesity and associated diseases,https://pmc.ncbi.nlm.nih.gov/articles/PMC4150960,GOLD,"\nBackground\n\nObesity has become an epidemic in the human population, and China has the highest number of obese patients in the world [1]. Because obesity involves an increase in the number of adipocytes, any of the factors involved in adipocyte differentiation might be of great importance for the development of obesity. To date, numerous factors and proteins have been implicated in the generation of new fat cells, including peroxisome proliferator-activated receptor gamma (PPARγ) [2,3], C...","[{""attributes"":null,""end"":781,""start"":13},{""attributes"":null,""end"":2087,""start"":783},{""attributes"":null,""end"":3470,""start"":2089},{""attributes"":null,""end"":4312,""start"":3472},{""attributes"":null,""end"":5329,""start"":4323},{""attributes"":null,""end"":5818,""start"":5409},{""attributes"":null,""end"":6151,""start"":5890},{""attributes"":null,""end"":6744,""start"":6218},{""attributes"":null,""end"":8054,""start"":6870},{""attributes"":null,""end"":8511,""start"":8056},{""attributes"":null,""end"":9031,""start"":8525},{""attributes"":n...","[{""attributes"":null,""end"":11,""start"":1},{""attributes"":null,""end"":4321,""start"":4314},{""attributes"":null,""end"":5407,""start"":5331},{""attributes"":null,""end"":5888,""start"":5820},{""attributes"":null,""end"":6216,""start"":6153},{""attributes"":null,""end"":6868,""start"":6746},{""attributes"":null,""end"":8523,""start"":8513},{""attributes"":null,""end"":16548,""start"":16527},{""attributes"":null,""end"":16559,""start"":16550},{""attributes"":null,""end"":17519,""start"":17439},{""attributes"":null,""end"":19472,""start"":19445},{""attrib...",CCBY
1,3738414,2789005638,10.1155/2018/1042479,,PROM and Labour Effects on Urinary Metabolome: A Pilot Study,https://pmc.ncbi.nlm.nih.gov/articles/PMC5817378,GOLD,"\nIntroduction\n\nThe early diagnosis of pregnancy-related complications and the prediction of pregnancy outcome are considered strategic clinical goals to ensure the health of mothers and of their babies. Among these, premature rupture of membranes (PROM) consists of the rupture of the foetal membranes before the onset of labour. It can be observed at any gestational age [1] and occurs in approximately 10% of pregnant women and in roughly 40% of preterm deliveries [2]. Foetal membranes are ...","[{""attributes"":null,""end"":4623,""start"":15},{""attributes"":null,""end"":6006,""start"":4703},{""attributes"":null,""end"":7040,""start"":6040},{""attributes"":null,""end"":7808,""start"":7063},{""attributes"":null,""end"":9464,""start"":7836},{""attributes"":null,""end"":10835,""start"":9466},{""attributes"":null,""end"":11411,""start"":10861},{""attributes"":null,""end"":12575,""start"":11413},{""attributes"":null,""end"":12831,""start"":12585},{""attributes"":null,""end"":12984,""start"":12833},{""attributes"":null,""end"":13373,""start"":12986},{""...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":4646,""start"":4625},{""attributes"":{""n"":""2.1.""},""end"":4701,""start"":4648},{""attributes"":{""n"":""2.2.""},""end"":6038,""start"":6008},{""attributes"":{""n"":""2.3.""},""end"":7061,""start"":7042},{""attributes"":{""n"":""2.4.""},""end"":7834,""start"":7810},{""attributes"":{""n"":""3.""},""end"":10859,""start"":10837},{""attributes"":{""n"":""3.3.""},""end"":12583,""start"":12577},{""attributes"":{""n"":""4.""},""end"":13385,""start"":13375},{""attributes"":{""n"":""5.""},""end"":167...",CCBY
2,15710447,2741402565,10.18653/v1/w17-1005,,Word Embedding and Topic Modeling Enhanced Multiple Features for Content Linking and Argument / Sentiment Labeling in Online Forums,https://aclanthology.org/W17-1005,HYBRID,"\nIntroduction\n\nComments to news and their providers in online forums have been increasing rapidly in recent years with a large number of user participants and huge amount of interactive contents. How can we understand the mass of comments effectively? A crucial initial step towards this goal should be content linking, which is to determine what comments link to, be that either specific news snippets or comments by other users. Furthermore, a set of labels for a given link may be articulat...","[{""attributes"":null,""end"":585,""start"":15},{""attributes"":null,""end"":813,""start"":587},{""attributes"":null,""end"":1141,""start"":815},{""attributes"":null,""end"":1511,""start"":1143},{""attributes"":null,""end"":2352,""start"":1531},{""attributes"":null,""end"":3126,""start"":2363},{""attributes"":null,""end"":3193,""start"":3145},{""attributes"":null,""end"":3418,""start"":3211},{""attributes"":null,""end"":3543,""start"":3452},{""attributes"":null,""end"":3672,""start"":3545},{""attributes"":null,""end"":3984,""start"":3674},{""attributes"":nul...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{""attributes"":{""n"":""2""},""end"":1529,""start"":1513},{""attributes"":{""n"":""3""},""end"":2361,""start"":2354},{""attributes"":{""n"":""3.1""},""end"":3143,""start"":3128},{""attributes"":{""n"":""3.1.1""},""end"":3209,""start"":3195},{""attributes"":{""n"":""3.1.2""},""end"":3450,""start"":3420},{""attributes"":{""n"":""3.1.3""},""end"":5187,""start"":5154},{""attributes"":{""n"":""3.2""},""end"":6316,""start"":6302},{""attributes"":null,""end"":7038,""start"":7024},{""attributes"":null,""end"":7173,""start"":7150},{""at...",CCBY


Unnamed: 0,c
0,11609787


 :: "02_stg".base_semanticscholar_s2orcv2 queries finished | since_start: 11.0 minutes, 11.75 seconds | since_last: 10.04 seconds :: 


' :: "02_stg".base_semanticscholar_s2orcv2 queries finished | since_start: 11.0 minutes, 11.75 seconds | since_last: 10.04 seconds :: '

In [4]:
# (Re)create the table via utils.create_table_from_sql_file, then sanity-check
# the result with a 3-row preview and a total row count.
# NOTE: the last recorded run of the create step took about 14 minutes.
database_name = '02_stg'
table_name = 'stg_semanticscholar_combined_works'
qualified_name = f'"{database_name}".{table_name}'  # quoted: the database name starts with a digit

timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite',  # options: fail, overwrite, ignore
    wait=True,
)
_ = timelogger.log(f'{qualified_name} created')

utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM {qualified_name} LIMIT 3 """, database=database_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM {qualified_name} """, database=database_name))
# log() also returns the message it prints; assigning it keeps the cell from
# echoing the same line a second time as its Out[] value.
_ = timelogger.log(f'{qualified_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_semanticscholar_combined_works already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_semanticscholar_combined_works
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_semanticscholar_combined_works/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_semanticscholar_combined_works created | since_start: 13.0 minutes, 48.32 seconds | since_last: 13.0 minutes, 48.32 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,title,source_url,openaccess_status,content_text,annotations_paragraph,annotations_section_header,license,content_abstract,publication_year,publication_date
0,220531160,3042431448.0,10.1167/iovs.61.8.16,,Quantitative Fundus Autofluorescence in Rhesus Macaques in Aging and Age-Related Drusen,https://pmc.ncbi.nlm.nih.gov/articles/PMC7425688,GOLD,"\nD iseases of the macula, such as age-related macular degeneration (AMD) and diabetic macular edema, are leading causes of visual impairment in developed countries. 1 Animal models of macular conditions can further detail the mechanisms of their pathogenesis and reveal new insights into developing novel interventions. Nonhuman primates (NHPs) are a compelling animal model for studying macular diseases as they are the only mammals beside humans to possess a true macula. NHPs, such as rhesus ...","[{""attributes"":null,""end"":1340,""start"":1},{""attributes"":null,""end"":2554,""start"":1342},{""attributes"":null,""end"":4618,""start"":2580},{""attributes"":null,""end"":6008,""start"":4640},{""attributes"":null,""end"":7798,""start"":6010},{""attributes"":null,""end"":9037,""start"":7824},{""attributes"":null,""end"":10039,""start"":9078},{""attributes"":null,""end"":11377,""start"":10063},{""attributes"":null,""end"":12469,""start"":11391},{""attributes"":null,""end"":13219,""start"":12492},{""attributes"":null,""end"":13875,""start"":13257},{""att...","[{""attributes"":null,""end"":2563,""start"":2556},{""attributes"":null,""end"":2578,""start"":2565},{""attributes"":null,""end"":4638,""start"":4620},{""attributes"":null,""end"":7822,""start"":7800},{""attributes"":null,""end"":9076,""start"":9039},{""attributes"":null,""end"":10061,""start"":10041},{""attributes"":null,""end"":11389,""start"":11379},{""attributes"":null,""end"":12478,""start"":12471},{""attributes"":null,""end"":12490,""start"":12480},{""attributes"":null,""end"":13255,""start"":13221},{""attributes"":null,""end"":14316,""start"":14284}...",CCBYNCND,"Purpose To employ quantitative fundus autofluorescence (qAF) imaging in rhesus macaques to noninvasively assess retinal pigment epithelial (RPE) lipofuscin in 
nonhuman primates (NHPs) as a model of aging and age-related macular degeneration (AMD). Methods The qAF imaging was performed on eyes of 26 rhesus macaques (mean age 18.8 ± 8.2 years, range 4–27 years) with normal-appearing fundus or with age-related soft drusen using a confocal scanning laser ophthalmoscope with 488 nm excitation and...",2020,2020-07-01
1,268446036,,10.1007/s40670-024-02017-9,,Humanism Rounds: A Multifaceted “Back to Bedside” Initiative to Improve Meaning at Work for Internal Medicine Residents,https://pmc.ncbi.nlm.nih.gov/articles/PMC11180076,HYBRID,"\nIntroduction\n\nBurnout affects medical residents nationwide, leading to poor resident wellbeing, career dissatisfaction, and decreased quality of patient care [1,2].The rates of burnout among residents range from 27 to 75%, with high rates noted in obstetrics and gynecology (75%), internal medicine (63%), and general surgery (40%) with the lowest rate among family medicine residents (27%) [3].Research into burnout during residency has focused on a variety of contributing factors including...","[{""attributes"":null,""end"":1032,""start"":15},{""attributes"":null,""end"":1487,""start"":1034},{""attributes"":null,""end"":1960,""start"":1489},{""attributes"":null,""end"":2269,""start"":1985},{""attributes"":null,""end"":3087,""start"":2291},{""attributes"":null,""end"":3559,""start"":3118},{""attributes"":null,""end"":4961,""start"":3612},{""attributes"":null,""end"":5991,""start"":4980},{""attributes"":null,""end"":6547,""start"":5993},{""attributes"":null,""end"":7174,""start"":6571},{""attributes"":null,""end"":7594,""start"":7185},{""attributes""...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":1983,""start"":1962},{""attributes"":null,""end"":2289,""start"":2271},{""attributes"":null,""end"":3103,""start"":3089},{""attributes"":null,""end"":3116,""start"":3105},{""attributes"":null,""end"":3590,""start"":3561},{""attributes"":null,""end"":3610,""start"":3592},{""attributes"":null,""end"":4978,""start"":4963},{""attributes"":null,""end"":6569,""start"":6549},{""attributes"":null,""end"":7183,""start"":7176},{""attributes"":null,""end"":7641,""start"":7596},{""attributes"":nu...",CCBY,"Introduction Burnout is an increasingly prevalent problem among resident physicians. 
To address this problem, the Accreditation Council on Graduate Medical Education (ACGME) created the Back to Bedside initiative, supporting resident-driven projects focused on increasing direct interactions with patients. In 2017, Baylor College of Medicine (BCM) Internal Medicine Residency received a Back to Bedside grant to develop and implement “Humanism Rounds,” a multifaceted program which sought to pro...",2024,2024-03-13
2,249401160,,10.1007/s12471-022-01700-z,,Major adverse cardiovascular events in older emergency department patients presenting with non-cardiac medical complaints,https://pmc.ncbi.nlm.nih.gov/articles/PMC9691805,GOLD,"\nIntroduction\n\nOlder patients are at high risk of adverse outcomes after an emergency department (ED) visit [1,2]. However, the risk of major adverse cardiovascular events (MACE) for older ED patients, presenting with noncardiac medical complaints, is unknown. Because preventive measures may improve outcome [3], early identification of patients at risk is highly important. \n\nBesides conventional cardiovascular risk factors, the cardiac biomarkers high-sensitivity cardiac Troponin T (hs-...","[{""attributes"":null,""end"":376,""start"":15},{""attributes"":null,""end"":783,""start"":378},{""attributes"":null,""end"":1290,""start"":785},{""attributes"":null,""end"":2491,""start"":1337},{""attributes"":null,""end"":2771,""start"":2493},{""attributes"":null,""end"":3143,""start"":2796},{""attributes"":null,""end"":4944,""start"":3162},{""attributes"":null,""end"":5900,""start"":4946},{""attributes"":null,""end"":6149,""start"":5902},{""attributes"":null,""end"":6368,""start"":6169},{""attributes"":null,""end"":6672,""start"":6385},{""attributes"":nul...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":1299,""start"":1292},{""attributes"":null,""end"":1335,""start"":1301},{""attributes"":null,""end"":2794,""start"":2773},{""attributes"":null,""end"":3160,""start"":3145},{""attributes"":null,""end"":6167,""start"":6151},{""attributes"":null,""end"":6383,""start"":6370},{""attributes"":null,""end"":8057,""start"":8050},{""attributes"":null,""end"":8103,""start"":8059},{""attributes"":null,""end"":9631,""start"":9566},{""attributes"":null,""end"":10054,""start"":10031},{""attributes"":...",CCBY,"The risk of major adverse cardiovascular events (MACE) for older emergency department (ED) patients presenting with non-cardiac 
medical complaints is unknown. To apply preventive measures timely, early identification of high-risk patients is incredibly important. We aimed at investigating the incidence of MACE within one year after their ED visit and the predictive value of high-sensitivity cardiac troponin T (hs-cTnT) and N‑terminal pro-B-type natriuretic peptide (NT-proBNP) for subsequent ...",2022,2022-06-07


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_semanticscholar_combined_works queries finished | since_start: 13.0 minutes, 58.61 seconds | since_last: 10.29 seconds :: 


' :: "02_stg".stg_semanticscholar_combined_works queries finished | since_start: 13.0 minutes, 58.61 seconds | since_last: 10.29 seconds :: '

In [18]:
# (Re)create the table via utils.create_table_from_sql_file, then sanity-check
# the result with a 3-row preview and a total row count.
# NOTE: the last recorded run of the create step took about 11 minutes.
database_name = '02_stg'
table_name = 'stg_semanticscholar_combined_works_content'
qualified_name = f'"{database_name}".{table_name}'  # quoted: the database name starts with a digit

timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite',  # options: fail, overwrite, ignore
    wait=True,
)
_ = timelogger.log(f'{qualified_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM {qualified_name} LIMIT 3 """, database=database_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM {qualified_name} """, database=database_name))
# log() also returns the message it prints; assigning it keeps the cell from
# echoing the same line a second time as its Out[] value.
_ = timelogger.log(f'{qualified_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_semanticscholar_combined_works_content/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_semanticscholar_combined_works_content created | since_start: 11.0 minutes, 20.68 seconds | since_last: 11.0 minutes, 20.68 seconds :: 


Unnamed: 0,id_semanticscholar,title,content_abstract,content_text,annotations_paragraph,annotations_section_header
0,260163299,Screening depression and anxiety in Indigenous peoples: A global scoping review,"Indigenous peoples’ worldviews are intricately interconnected and interrelated with their communities and the environments in which they live. Their worldviews also manifest in a holistic view of health and well-being, which contrasts with those of the dominant western biomedical model. However,...","\nIntroduction\n\nThe worldviews of Indigenous peoples are intricately interrelated and interconnected with those of their communities and the environments in which they live. Indigenous people conceptualise health and well-being more holistically (Gall et al., 2021) than the dominant western bi...","[{""attributes"":null,""end"":512,""start"":15},{""attributes"":null,""end"":1314,""start"":514},{""attributes"":null,""end"":2095,""start"":1316},{""attributes"":null,""end"":3698,""start"":2097},{""attributes"":null,""end"":4904,""start"":3700},{""attributes"":null,""end"":5652,""start"":4906},{""attributes"":null,""end"":6585,""star...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":5660,""start"":5654},{""attributes"":null,""end"":7233,""start"":7215},{""attributes"":null,""end"":11222,""start"":11207},{""attributes"":null,""end"":12489,""start"":12475},{""attributes"":null,""end"":13587,""start"":13572},{""attributes"":null,""end"":14789..."
1,112601881,Model Development of a Blast Furnace Stove,,\nIntroduction\n\nAbout one third of the world primary energy consumption is from the manufacturing industries. The iron and steel industry (ISI) is the second largest energy user and accounts for 20 % of the energy usage by the manufacturing industries [1]. Due to heavy reliance on fossil fuels...,"[{""attributes"":null,""end"":519,""start"":15},{""attributes"":null,""end"":1303,""start"":521},{""attributes"":null,""end"":1735,""start"":1305},{""attributes"":null,""end"":2270,""start"":1737},{""attributes"":null,""end"":2995,""start"":2272},{""attributes"":null,""end"":3406,""start"":2997},{""attributes"":null,""end"":3691,""star...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":3414,""start"":3408},{""attributes"":null,""end"":4208,""start"":4172},{""attributes"":{""n"":""2.1.""},""end"":5536,""start"":5527},{""attributes"":{""n"":""2.2.""},""end"":7243,""start"":7228},{""attributes"":{""n"":""2.3.""},""end"":7941,""start"":7909},..."
2,118573517,Double beta decay transition mechanism,"After briefly reviewing $\beta \beta$ decay as a test of the neutrino mass, I examine the nuclear structure involved in this process. Simple formulas (\`{a} la Pad\'{e}) are designed for the transition amplitudes and the general behavior of $\beta \beta$ decay amplitudes in the quasiparticle ran...",\nIntroduction\n\nThe double beta (ββ) decay is a nice example of the interrelation between the Particle Physics and the Nuclear Physics: we can get information on the properties of the neutrino and the weak interaction from the ββ decay only if we know who to deal we the nuclear structure invol...,"[{""attributes"":null,""end"":582,""start"":15},{""attributes"":null,""end"":1069,""start"":584},{""attributes"":null,""end"":1224,""start"":1071},{""attributes"":null,""end"":1310,""start"":1226},{""attributes"":null,""end"":1343,""start"":1312},{""attributes"":null,""end"":1420,""start"":1345},{""attributes"":null,""end"":1839,""star...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""3.""},""end"":10044,""start"":9997},{""attributes"":{""n"":""4.""},""end"":14193,""start"":14165}]"


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_semanticscholar_combined_works_content queries finished | since_start: 11.0 minutes, 31.19 seconds | since_last: 10.51 seconds :: 


' :: "02_stg".stg_semanticscholar_combined_works_content queries finished | since_start: 11.0 minutes, 31.19 seconds | since_last: 10.51 seconds :: '

In [19]:
# (Re)create the table via utils.create_table_from_sql_file, then sanity-check
# the result with a 3-row preview and a total row count.
database_name = '02_stg'
table_name = 'stg_semanticscholar_combined_works_metadata'
qualified_name = f'"{database_name}".{table_name}'  # quoted: the database name starts with a digit

timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite',  # options: fail, overwrite, ignore
    wait=True,
)
_ = timelogger.log(f'{qualified_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM {qualified_name} LIMIT 3 """, database=database_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM {qualified_name} """, database=database_name))
# log() also returns the message it prints; assigning it keeps the cell from
# echoing the same line a second time as its Out[] value.
_ = timelogger.log(f'{qualified_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_semanticscholar_combined_works_metadata/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_semanticscholar_combined_works_metadata created | since_start: 5.85 seconds | since_last: 5.85 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,source_url,openaccess_status,license,publication_year,publication_date
0,46791531,2755160340.0,10.1364/oe.25.023899,,https://doi.org/10.1364/OE.25.023899,GOLD,CCBY,2017,2017-10-02
1,13140185,2010414254.0,10.3390/rs6031863,,https://doi.org/10.3390/rs6031863,GOLD,CCBY,2014,2014-02-28
2,253366618,,10.1016/j.xpro.2022.101803,,https://pmc.ncbi.nlm.nih.gov/articles/PMC9641055,GOLD,CCBYNCND,2022,2022-11-04


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_semanticscholar_combined_works_metadata queries finished | since_start: 15.13 seconds | since_last: 9.28 seconds :: 


' :: "02_stg".stg_semanticscholar_combined_works_metadata queries finished | since_start: 15.13 seconds | since_last: 9.28 seconds :: '

In [5]:
# (Re)create the table via utils.create_table_from_sql_file, then sanity-check
# the result with a 3-row preview and a total row count.
database_name = '02_stg'
table_name = 'base_arxiv_metadata'
qualified_name = f'"{database_name}".{table_name}'  # quoted: the database name starts with a digit

timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite',  # options: fail, overwrite, ignore
    wait=True,
)
_ = timelogger.log(f'{qualified_name} created')

utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM {qualified_name} LIMIT 3 """, database=database_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM {qualified_name} """, database=database_name))
# log() also returns the message it prints; assigning it keeps the cell from
# echoing the same line a second time as its Out[] value.
_ = timelogger.log(f'{qualified_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.base_arxiv_metadata already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg base_arxiv_metadata
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_arxiv_metadata/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_arxiv_metadata created | since_start: 15.44 seconds | since_last: 15.44 seconds :: 


Unnamed: 0,id_arxiv,id_doi,title,abstract,license
0,903.1601,,Parabolic-Dish Solar Concentrators of Film on Foam,"Parabolic and spherical mirrors are constructed of aluminized PET polyester film on urethane foam. During construction, the chosen shape of the mirror is created by manipulating the elastic/plastic behavior of the film with air pressure. Foam is then applied to the film and, once hardened, air pressure is removed. At an f-number of 0.68, preliminary models have an optical angular spread of less than 0.25 degrees, a factor of 3.3 smaller than that for a perfectly spherical mirror. The possi...",ArXiv nonexclusive-distrib
1,903.1604,10.3842/SIGMA.2009.029,Limits of Gaudin Systems: Classical and Quantum Cases,"We consider the XXX homogeneous Gaudin system with $N$ sites, both in classical and the quantum case. In particular we show that a suitable limiting procedure for letting the poles of its Lax matrix collide can be used to define new families of Liouville integrals (in the classical case) and new ""Gaudin"" algebras (in the quantum case). We will especially treat the case of total collisions, that gives rise to (a generalization of) the so called Bending flows of Kapovich and Millson. Some as...",CCBYNCSA
2,903.16,,Typically Real Harmonic Functions,We consider a class $\THO$ of typically real harmonic functions on the unit disk that contains the class of normalized analytic and typically real functions. We also obtain some partial results about the region of univalence for this class.,ArXiv nonexclusive-distrib


Unnamed: 0,c
0,2816721


 :: "02_stg".base_arxiv_metadata queries finished | since_start: 24.78 seconds | since_last: 9.35 seconds :: 


' :: "02_stg".base_arxiv_metadata queries finished | since_start: 24.78 seconds | since_last: 9.35 seconds :: '

In [21]:
# (Re)create the table via utils.create_table_from_sql_file, then sanity-check
# the result with a 3-row preview and a total row count.
database_name = '02_stg'
table_name = 'stg_unified_works_metadata_01_joined_to_arxiv'
qualified_name = f'"{database_name}".{table_name}'  # quoted: the database name starts with a digit

timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=database_name,
    table_name=table_name,
    overwrite_strategy='overwrite',  # options: fail, overwrite, ignore
    wait=True,
)
_ = timelogger.log(f'{qualified_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM {qualified_name} LIMIT 3 """, database=database_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM {qualified_name} """, database=database_name))
# log() also returns the message it prints; assigning it keeps the cell from
# echoing the same line a second time as its Out[] value.
_ = timelogger.log(f'{qualified_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_metadata_01_joined_to_arxiv/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_metadata_01_joined_to_arxiv created | since_start: 11.79 seconds | since_last: 11.79 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,source_url,openaccess_status,publication_year,publication_date,license,license_allows_derivative_reuse
0,4951032,2789407911,10.1016/j.tecto.2018.03.010,,https://doi.org/10.1016/J.TECTO.2018.03.010,HYBRID,2018,2018-04-22,CCBY,1
1,204923380,2981732074,10.1016/j.jalz.2019.08.201,,https://pmc.ncbi.nlm.nih.gov/articles/PMC7012375,HYBRID,2019,2019-10-28,CCBYNCND,0
2,14519185,2114693455,10.1099/vir.0.007377-0,,https://pmc.ncbi.nlm.nih.gov/articles/PMC2885064,HYBRID,2009,2009-03-01,CCBY,1


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_unified_works_metadata_01_joined_to_arxiv queries finished | since_start: 21.14 seconds | since_last: 9.35 seconds :: 


' :: "02_stg".stg_unified_works_metadata_01_joined_to_arxiv queries finished | since_start: 21.14 seconds | since_last: 9.35 seconds :: '

In [15]:
# Build the staging table that joins the unified works metadata to OpenAlex,
# then preview a few rows and the total row count.
table_name = 'stg_unified_works_metadata_02_joined_to_openalex'
db_name = '02_stg'  # named once, matching the other staging cells in this notebook
timelogger = utils.TimeLogger()

# (Re)create the table. With overwrite_strategy='overwrite' the helper logs that
# it deletes the table's existing S3 objects first.
# Presumably the SQL comes from a file named after the table - confirm in utils.
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

# Sanity check: three sample rows and the row count of the new table.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
# Trailing ';' stops Jupyter from echoing log()'s return value, which would
# repeat the line that log() has already printed.
timelogger.log(f'"{db_name}".{table_name} queries finished');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_metadata_02_joined_to_openalex/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_metadata_02_joined_to_openalex created | since_start: 9.75 seconds | since_last: 9.75 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on
0,239285849,,10.3991/ijet.v16i18.24251,,2021,2021-09-20,CCBY,1,https://doi.org/10.3991/ijet.v16i18.24251,0,1,1,3200054761,10.3991/ijet.v16i18.24251,en,11122,Online Learning and Analytics,1706,Computer Science Applications,17,Computer Science,3,Physical Sciences,doi
1,248062035,,10.3390/rs14071753,,2022,2022-04-06,CCBY,1,https://doi.org/10.3390/rs14071753,0,1,1,4224245753,10.3390/rs14071753,en,11276,Solar Radiation and Photovoltaics,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,doi
2,269481513,,10.1109/tvcg.2024.3395365,,2024,2024-04-30,CCBYNCND,0,https://doi.org/10.1109/TVCG.2024.3395365,0,1,1,4396523158,10.1109/tvcg.2024.3395365,en,10734,Information and Cyber Security,1710,Information Systems,17,Computer Science,3,Physical Sciences,doi


Unnamed: 0,c
0,833261


 :: "02_stg".stg_unified_works_metadata_02_joined_to_openalex queries finished | since_start: 19.15 seconds | since_last: 9.40 seconds :: 


' :: "02_stg".stg_unified_works_metadata_02_joined_to_openalex queries finished | since_start: 19.15 seconds | since_last: 9.40 seconds :: '

In [16]:
# Build the filtered and tagged staging table (its preview shows added
# `bucket_10p` and `subset` columns), then preview rows and the row count.
table_name = 'stg_unified_works_metadata_03_filtered_and_tagged'
db_name = '02_stg'
timelogger = utils.TimeLogger()

# (Re)create the table. With overwrite_strategy='overwrite' the helper logs that
# it drops an existing Glue table and deletes its S3 objects first.
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

# Sanity check: three sample rows and the row count of the new table.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
# Trailing ';' stops Jupyter from echoing log()'s return value, which would
# repeat the line that log() has already printed.
timelogger.log(f'"{db_name}".{table_name} queries finished');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_unified_works_metadata_03_filtered_and_tagged already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_unified_works_metadata_03_filtered_and_tagged
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_metadata_03_filtered_and_tagged/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_metadata_03_filtered_and_tagged created | since_start: 6.93 seconds | since_last: 6.93 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on,bucket_10p,subset
0,2557485,2054117411.0,10.1155/2013/913038,,2013,2013-07-18,CCBY,1,https://pmc.ncbi.nlm.nih.gov/articles/PMC3732635,1,1,1,2054117411,10.1155/2013/913038,en,10054,Parallel Computing and Optimization Techniques,1708,Hardware and Architecture,17,Computer Science,3,Physical Sciences,doi,1,test
1,272597057,,10.46586/tches.v2024.i4.231-257,,2024,,CCBY,1,https://doi.org/10.46586/tches.v2024.i4.231-257,0,1,1,4402807365,10.46586/tches.v2024.i4.231-257,en,10054,Parallel Computing and Optimization Techniques,1708,Hardware and Architecture,17,Computer Science,3,Physical Sciences,doi,1,test
2,7807512,1989540889.0,10.1155/1995/278064,,1995,,CCBY,1,https://doi.org/10.1155/1995/278064,1,1,1,1989540889,10.1155/1995/278064,en,10054,Parallel Computing and Optimization Techniques,1708,Hardware and Architecture,17,Computer Science,3,Physical Sciences,doi,1,test


Unnamed: 0,c
0,396052


 :: "02_stg".stg_unified_works_metadata_03_filtered_and_tagged queries finished | since_start: 15.58 seconds | since_last: 8.65 seconds :: 


' :: "02_stg".stg_unified_works_metadata_03_filtered_and_tagged queries finished | since_start: 15.58 seconds | since_last: 8.65 seconds :: '

In [17]:
# Build the staging lookup table of OpenAlex primary topics, then preview the
# ten most frequent topics and the row count.
# NOTE(review): the recorded run of this cell returned a single topic row
# (COUNT(*) = 1). Confirm the SQL behind this table is not left in a
# debug/filtered state.
table_name = 'stg_topics'
db_name = '02_stg'
timelogger = utils.TimeLogger()

# (Re)create the table. With overwrite_strategy='overwrite' the helper logs that
# it drops an existing Glue table and deletes its S3 objects first.
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

# Sanity check: largest topics first, plus the number of topics.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} ORDER BY openalex_primary_topic_count DESC LIMIT 10 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
# Trailing ';' stops Jupyter from echoing log()'s return value, which would
# repeat the line that log() has already printed.
timelogger.log(f'"{db_name}".{table_name} queries finished');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_topics already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_topics
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_topics/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_topics created | since_start: 5.76 seconds | since_last: 5.76 seconds :: 


Unnamed: 0,openalex_primary_topic_index,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_count,openalex_primary_topic_percent,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name
0,6,10036,Advanced Neural Network Applications,6191,1.563179,1707,Computer Vision and Pattern Recognition


Unnamed: 0,c
0,1


 :: "02_stg".stg_topics queries finished | since_start: 12.94 seconds | since_last: 7.18 seconds :: 


' :: "02_stg".stg_topics queries finished | since_start: 12.94 seconds | since_last: 7.18 seconds :: '

In [18]:
# Build the staging lookup table of OpenAlex primary-topic subfields, then
# preview up to 300 subfields (largest first) and the row count.
# NOTE(review): the recorded run of this cell returned a single subfield row
# (COUNT(*) = 1, 100 percent). Confirm the SQL behind this table is not left in
# a debug/filtered state.
table_name = 'stg_subfields'
db_name = '02_stg'
timelogger = utils.TimeLogger()

# (Re)create the table. With overwrite_strategy='overwrite' the helper logs that
# it drops an existing Glue table and deletes its S3 objects first.
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

# Sanity check: largest subfields first, plus the number of subfields.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} ORDER BY openalex_primary_topic_subfield_count DESC LIMIT 300 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
# Trailing ';' stops Jupyter from echoing log()'s return value, which would
# repeat the line that log() has already printed.
timelogger.log(f'"{db_name}".{table_name} queries finished');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_subfields already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_subfields
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_subfields/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_subfields created | since_start: 3.52 seconds | since_last: 3.52 seconds :: 


Unnamed: 0,openalex_primary_topic_subfield_index,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_subfield_count,openalex_primary_topic_subfield_percent
0,0,1707,Computer Vision and Pattern Recognition,6191,100.0


Unnamed: 0,c
0,1


 :: "02_stg".stg_subfields queries finished | since_start: 12.40 seconds | since_last: 8.88 seconds :: 


' :: "02_stg".stg_subfields queries finished | since_start: 12.40 seconds | since_last: 8.88 seconds :: '

In [19]:
# Build the staging table that adds topic/subfield counts and indexes to the
# works metadata, then preview a random sample and the row count.
table_name = 'stg_unified_works_metadata_04_with_topics_and_subfields'
db_name = '02_stg'
timelogger = utils.TimeLogger()

# (Re)create the table. With overwrite_strategy='overwrite' the helper logs that
# it drops an existing Glue table and deletes its S3 objects first.
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

# Sanity check. ORDER BY RANDOM() draws a different three-row sample on every
# run, so the preview below is not reproducible row-for-row.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} ORDER BY RANDOM() LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
# Trailing ';' stops Jupyter from echoing log()'s return value, which would
# repeat the line that log() has already printed.
timelogger.log(f'"{db_name}".{table_name} queries finished');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_unified_works_metadata_04_with_topics_and_subfields already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_unified_works_metadata_04_with_topics_and_subfields
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_metadata_04_with_topics_and_subfields/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_metadata_04_with_topics_and_subfields created | since_start: 7.94 seconds | since_last: 7.94 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_count,openalex_primary_topic_index,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_subfield_count,openalex_primary_topic_subfield_index,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on,subset
0,28698116,2283235182.0,10.1155/2016/1894713,,2016,2016-01-01,CCBY,1,https://doi.org/10.1155/2016/1894713,1,1,1,2283235182,10.1155/2016/1894713,en,11614,Cloud Data Security Solutions,,,1710,Information Systems,,,17,Computer Science,3,Physical Sciences,doi,train
1,249263338,,10.3390/s22114109,,2022,2022-05-28,CCBY,1,https://pmc.ncbi.nlm.nih.gov/articles/PMC9185275,0,1,1,4281615113,10.3390/s22114109,en,10812,Human Pose and Action Recognition,,,1707,Computer Vision and Pattern Recognition,6191.0,0.0,17,Computer Science,3,Physical Sciences,doi,test
2,231824198,,10.1109/access.2021.3054952,,2021,,CCBY,1,https://doi.org/10.1109/ACCESS.2021.3054952,0,1,1,3124383445,10.1109/access.2021.3054952,en,11017,Chaos-based Image/Signal Encryption,,,1707,Computer Vision and Pattern Recognition,6191.0,0.0,17,Computer Science,3,Physical Sciences,doi,train


Unnamed: 0,c
0,396052


 :: "02_stg".stg_unified_works_metadata_04_with_topics_and_subfields queries finished | since_start: 19.18 seconds | since_last: 11.24 seconds :: 


' :: "02_stg".stg_unified_works_metadata_04_with_topics_and_subfields queries finished | since_start: 19.18 seconds | since_last: 11.24 seconds :: '

In [20]:
# Build the final staging table, which (per its preview) carries title,
# abstract, full text and annotation columns alongside the metadata. Then
# preview a few rows and the row count.
table_name = 'stg_unified_works_filtered'
db_name = '02_stg'  # named once, matching the other staging cells in this notebook
timelogger = utils.TimeLogger()

# (Re)create the table. With overwrite_strategy='overwrite' the helper logs that
# it drops an existing Glue table and deletes its S3 objects first.
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

# Sanity check: three sample rows and the row count of the new table.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
# Trailing ';' stops Jupyter from echoing log()'s return value, which would
# repeat the line that log() has already printed.
timelogger.log(f'"{db_name}".{table_name} queries finished');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_unified_works_filtered already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_unified_works_filtered
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_filtered/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_filtered created | since_start: 1.0 minute, 27.29 seconds | since_last: 1.0 minute, 27.29 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_count,openalex_primary_topic_index,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_subfield_count,openalex_primary_topic_subfield_index,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on,title,content_abstract,content_text,annotations_paragraph,annotations_section_header,subset
0,240129151,,10.5755/j02.eie.29648,,2021,2021-10-27,CCBY,1,https://doi.org/10.5755/j02.eie.29648,0,1,1,3211154457,10.5755/j02.eie.29648,en,10057,Face and Expression Recognition,,,1707,Computer Vision and Pattern Recognition,6191,0,17,Computer Science,3,Physical Sciences,doi,A Novel Fuzzy Optimized CNN-RNN Method for Facial Expression Recognition,"Facial expression is one of the important ways of transferring emotion in interpersonal communication, and it has been widely used in many interpersonal communication systems. The traditional facial expression recognition methods are not intelligent enough to manage the model uncertainty. The de...","\nI. INTRODUCTION\n\nExpression is one of the most important ways to convey emotions in interpersonal communication. It mainly refers to the formation of facial muscles and facial features. Common expressions include anger, disgust, fear, joy, surprise, etc. [1]. People can get their emotions di...","[{""attributes"":null,""end"":621,""start"":18},{""attributes"":null,""end"":1689,""start"":623},{""attributes"":null,""end"":4630,""start"":1691},{""attributes"":null,""end"":4814,""start"":4632},{""attributes"":null,""end"":5175,""start"":4816},{""attributes"":null,""end"":5963,""start"":5177},{""attributes"":null,""end"":6273,""star...","[{""attributes"":null,""end"":16,""start"":1},{""attributes"":null,""end"":6312,""start"":6275},{""attributes"":null,""end"":6353,""start"":6314},{""attributes"":null,""end"":7062,""start"":7038},{""attributes"":{""n"":""1.""},""end"":7327,""start"":7313},{""attributes"":{""n"":""2.""},""end"":8405,""start"":8390},{""attributes"":{""n"":""3.""}...",validation
1,140112774,2942774952.0,10.1007/s10994-019-05795-1,,2019,2019-05-09,CCBY,1,https://doi.org/10.1007/s10994-019-05795-1,1,1,1,2942774952,10.1007/s10994-019-05795-1,en,10057,Face and Expression Recognition,,,1707,Computer Vision and Pattern Recognition,6191,0,17,Computer Science,3,Physical Sciences,doi,Efficient feature selection using shrinkage estimators,"Information theoretic feature selection methods quantify the importance of each feature by estimating mutual information terms to capture: the relevancy, the redundancy and the complementarity. These terms are commonly estimated by maximum likelihood, while an under-explored area of research is ...","\nFor this proof we will use Ledoit and Wolf theorem (Ledoit and Wolf, 2003), which derives an analytical expression for the optimal shrinkage intensity that guarantees minimal MSE. Using the fact that pML (xy) is an unbiased estimator of p(xy), the optimal shrinkage intensity takes the followin...","[{""attributes"":null,""end"":934,""start"":1},{""attributes"":null,""end"":992,""start"":936},{""attributes"":null,""end"":1013,""start"":994},{""attributes"":null,""end"":1092,""start"":1015},{""attributes"":null,""end"":1152,""start"":1094},{""attributes"":null,""end"":1256,""start"":1154},{""attributes"":null,""end"":1317,""start"":...","[{""attributes"":null,""end"":4122,""start"":4100},{""attributes"":null,""end"":4633,""start"":4611},{""attributes"":null,""end"":6139,""start"":6104},{""attributes"":null,""end"":6256,""start"":6241},{""attributes"":null,""end"":6463,""start"":6441}]",validation
2,221397204,3081792903.0,10.1109/access.2022.3155233,2009.00505,2020,2020-09-01,CCBY,1,https://arxiv.org/abs/2009.00505,1,1,1,4214697132,10.1109/access.2022.3155233,en,10057,Face and Expression Recognition,,,1707,Computer Vision and Pattern Recognition,6191,0,17,Computer Science,3,Physical Sciences,doi,Graph Embedding with Data Uncertainty,"Spectral-based subspace learning is a common data preprocessing step in many machine learning pipelines. The main aim is to learn a meaningful low dimensional embedding of the data. However, most subspace learning methods do not take into consideration possible measurement inaccuracies or artifa...","\nIntroduction\n\nWith the advancement of data collection processes, high dimensional data are available for applying machine learning approaches. However, the impracticability of working in high dimensional spaces due to the curse of dimensionality and the realization that the data in many prob...","[{""attributes"":null,""end"":1076,""start"":15},{""attributes"":null,""end"":2001,""start"":1078},{""attributes"":null,""end"":4046,""start"":2003},{""attributes"":null,""end"":4329,""start"":4048},{""attributes"":null,""end"":4459,""start"":4331},{""attributes"":null,""end"":4720,""start"":4461},{""attributes"":null,""end"":4907,""st...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":5308,""start"":5296},{""attributes"":{""n"":""2.1.""},""end"":5325,""start"":5310},{""attributes"":{""n"":""2.2.""},""end"":8284,""start"":8259},{""attributes"":{""n"":""3.""},""end"":10392,""start"":10355},{""attributes"":{""n"":""3.1.""},""end"":13051,""star...",test


Unnamed: 0,c
0,396052


 :: "02_stg".stg_unified_works_filtered queries finished | since_start: 1.0 minute, 35.51 seconds | since_last: 8.22 seconds :: 


' :: "02_stg".stg_unified_works_filtered queries finished | since_start: 1.0 minute, 35.51 seconds | since_last: 8.22 seconds :: '

In [21]:
# Build the core-layer `unified_works` table, then preview a few rows and the
# row count.
table_name = 'unified_works'
db_name = '03_core'
timelogger = utils.TimeLogger()

# (Re)create the table. With overwrite_strategy='overwrite' the helper logs that
# it drops an existing Glue table and deletes its S3 objects first.
# s3_parent_target_path points the helper at the core-layer prefix from config
# rather than the staging prefix used when the argument is omitted.
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    s3_parent_target_path=config.S3_CORE_DATA_PATH
)
timelogger.log(f'"{db_name}".{table_name} created')

# Sanity check: three sample rows and the row count of the new table.
utils.pd_set_options(cols=100)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
# Trailing ';' stops Jupyter from echoing log()'s return value, which would
# repeat the line that log() has already printed.
timelogger.log(f'"{db_name}".{table_name} queries finished');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 03_core.unified_works already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core unified_works
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".unified_works created | since_start: 39.76 seconds | since_last: 39.76 seconds :: 


Unnamed: 0,id,topic_index,topic_display_name,subfield_index,subfield_display_name,title,abstract,fulltext,subset
0,271618904,,Computational Drug Discovery Methods,,Computational Theory and Mathematics,Editorial: Pharmacoinformatics: new developments and challenges in drug design,"ﬁ cacy, and mechanism of action. One",\nEditorial on the Research Topic Pharmacoinformatics: new developments and challenges in drug d...,train
1,10189194,,Distributed Control Multi-Agent Systems,,Computer Networks and Communications,Collaborative emitter tracking using Rao-Blackwellized random exchange diffusion particle filtering,"We introduce in this paper the fully distributed, random exchange diffusion particle filter (ReD...","\nIntroduction\n\nIn several engineering applications, e.g., target tracking or fault detection,...",train
2,14599631,,Computational Drug Discovery Methods,,Computational Theory and Mathematics,A Network-Based Target Overlap Score for Characterizing Drug Combinations: High Correlation with...,Drug combinations are highly efficient in systemic treatment of complex multigene diseases such ...,\nIntroduction\n\nIn the past few decades the number of novel marketed drugs has fallen much bel...,train


Unnamed: 0,c
0,396052


 :: "03_core".unified_works queries finished | since_start: 48.44 seconds | since_last: 8.68 seconds :: 


' :: "03_core".unified_works queries finished | since_start: 48.44 seconds | since_last: 8.68 seconds :: '

In [22]:
# Build the three per-split core tables (train / test / validation) and, for
# each one, preview a few rows and the row count.
timelogger = utils.TimeLogger()
db_name = '03_core'
table_names = ['unified_works_train', 'unified_works_test', 'unified_works_validation']

# The display setting is the same for every table, so apply it once up front
# instead of on every loop iteration.
utils.pd_set_options(cols=100)

for table_name in table_names:
    # (Re)create the table under the core-layer S3 prefix from config. With
    # overwrite_strategy='overwrite' the helper logs that it drops an existing
    # Glue table and deletes its S3 objects first.
    utils.create_table_from_sql_file(
        database_name = db_name,
        table_name = table_name,
        overwrite_strategy='overwrite', # options: fail, overwrite, ignore
        wait=True,
        s3_parent_target_path=config.S3_CORE_DATA_PATH
    )
    timelogger.log(f'"{db_name}".{table_name} created')

    # Sanity check: three sample rows and the row count of this split.
    display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
    display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
    timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 03_core.unified_works_train already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core unified_works_train
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works_train/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".unified_works_train created | since_start: 35.74 seconds | since_last: 35.74 seconds :: 


Unnamed: 0,id,topic_index,topic_display_name,subfield_index,subfield_display_name,title,abstract,fulltext,subset
0,268178951,,Quantum Computing Algorithms and Architecture,,Artificial Intelligence,A framework for demonstrating practical quantum advantage: comparing quantum against classical g...,"Generative modeling has seen a rising interest in both classical and quantum machine learning, a...",\nI. INTRODUCTION\n\nGenerative modeling has become more widely popular with its remarkable succ...,train
1,225076354,,Quantum Computing Algorithms and Architecture,,Artificial Intelligence,Achieving a quantum smart workforce,Interest in building dedicated quantum information science and engineering (QISE) education prog...,\nINTRODUCTION\n\nThe meteoric rise of interest in Quantum Information Science and Engineering (...,train
2,244527525,,Quantum Computing Algorithms and Architecture,,Artificial Intelligence,Quantum state preparation by adiabatic evolution with custom gates,Quantum state preparation by adiabatic evolution is currently rendered ineffective by the long i...,\nI. INTRODUCTION\n\nQuantum computing holds the key to efficiently solving problems that are in...,train


Unnamed: 0,c
0,316586


 :: "03_core".unified_works_train queries finished | since_start: 43.99 seconds | since_last: 8.25 seconds :: 
Table 03_core.unified_works_test already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core unified_works_test
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works_test/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".unified_works_test created | since_start: 52.19 seconds | since_last: 8.20 seconds :: 


Unnamed: 0,id,topic_index,topic_display_name,subfield_index,subfield_display_name,title,abstract,fulltext,subset
0,231732546,,Cooperative Communication and Network Coding,,Computer Networks and Communications,Constructions of Orbit Codes Based on Unitary Spaces Over Finite Fields,"Orbit codes, as special constant dimension codes, have attracted much attention due to their app...",\nI. INTRODUCTION\n\nRandom network coding plays an important role in coding theory for its high...,test
1,85464220,,Multimodal Machine Learning Applications,0.0,Computer Vision and Pattern Recognition,Description Generation for Remote Sensing Images Using Attribute Attention Mechanism,Image captioning generates a semantic description of an image. It deals with image understanding...,"\nIntroduction\n\nWith the development of computers and sensors, modern remote sensing technolog...",test
2,36387479,,Matrix Theory and Algorithms,,Computational Theory and Mathematics,A Parameterized Splitting Preconditioner for Generalized Saddle Point Problems,"By using Sherman-Morrison-Woodbury formula, we introduce a preconditioner based on parameterized...","\nIntroduction\n\nIn some scientific and engineering applications, such as finite element method...",test


Unnamed: 0,c
0,39753


 :: "03_core".unified_works_test queries finished | since_start: 1.0 minute, 1.98 seconds | since_last: 9.80 seconds :: 
Table 03_core.unified_works_validation already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core unified_works_validation
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works_validation/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".unified_works_validation created | since_start: 1.0 minute, 10.83 seconds | since_last: 8.85 seconds :: 


Unnamed: 0,id,topic_index,topic_display_name,subfield_index,subfield_display_name,title,abstract,fulltext,subset
0,277531389,,Wireless Signal Modulation Classification,,Artificial Intelligence,KAN-ResNet-Enhanced Radio Frequency Fingerprint Identification with Zero-Forcing Equalization,Radio Frequency Fingerprint Identification (RFFI) is a promising device authentication technique...,\nIntroduction\n\nRadio Frequency Fingerprint Identification (RFFI) technology differentiates de...,validation
1,258672859,,Virtual Reality Applications and Impacts,,Human-Computer Interaction,Using Immersive Virtual Reality in an Online Biology Course,"Interest in virtual reality (VR) for teaching and learning in higher education is growing, given...",\nIntroduction\n\nVirtual reality (VR) technology is increasingly being considered by educators ...,validation
2,221077930,,Virtual Reality Applications and Impacts,,Human-Computer Interaction,The Developing Bodily Self: How Posture Constrains Body Representation in Childhood.,Adults' body representation is constrained by multisensory information and knowledge of the body...,"\nThe feeling of inhabiting a body is fundamental for self-experience (Kilteni, Groten, & Slater...",validation


Unnamed: 0,c
0,39713


 :: "03_core".unified_works_validation queries finished | since_start: 1.0 minute, 19.25 seconds | since_last: 8.42 seconds :: 


In [23]:
# Check the train/test/validation split: rows per subset in each split table,
# and that count as a percentage of all rows in "03_core".unified_works.
#
# db_name is set here so the cell does not rely on a value left behind by an
# earlier cell. The per-table SELECT is written once and repeated for each
# split table instead of being copy-pasted.
db_name = '03_core'
split_table_names = ['unified_works_train', 'unified_works_test', 'unified_works_validation']

split_share_select = """
    SELECT
        subset,
        COUNT(*) AS c,
        COUNT(*) * 100.0 / (SELECT COUNT(*) FROM "{db_name}".unified_works) AS p
    FROM
        "{db_name}".{table_name}
    GROUP BY
        subset
"""

# One SELECT per split table, stacked with UNION ALL (no de-duplication needed
# because each SELECT reads a different table).
split_share_sql = "\n    UNION ALL\n".join(
    split_share_select.format(db_name=db_name, table_name=split_table_name)
    for split_table_name in split_table_names
)

display(wr.athena.read_sql_query(split_share_sql, db_name))

Unnamed: 0,subset,c,p
0,train,316586,79.935463
1,validation,39713,10.027219
2,test,39753,10.037318


In [24]:
# Build the core-layer `topics` lookup table, then preview up to 30 rows and
# the row count.
table_name = 'topics'
db_name = '03_core'
timelogger = utils.TimeLogger()

# (Re)create the table under the core-layer S3 prefix from config. With
# overwrite_strategy='overwrite' the helper logs that it drops an existing Glue
# table and deletes its S3 objects first.
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    s3_parent_target_path=config.S3_CORE_DATA_PATH
)
timelogger.log(f'"{db_name}".{table_name} created')

# Sanity check: sample rows and the number of topics.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 30 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
# Trailing ';' stops Jupyter from echoing log()'s return value, which would
# repeat the line that log() has already printed.
timelogger.log(f'"{db_name}".{table_name} queries finished');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 03_core.topics already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core topics
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/topics/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".topics created | since_start: 3.53 seconds | since_last: 3.53 seconds :: 


Unnamed: 0,topic_index,topic_original_id,topic_display_name,topic_count,subfield_original_id,subfield_display_name
0,6,10036,Advanced Neural Network Applications,6191,1707,Computer Vision and Pattern Recognition


Unnamed: 0,c
0,1


 :: "03_core".topics queries finished | since_start: 10.84 seconds | since_last: 7.31 seconds :: 


' :: "03_core".topics queries finished | since_start: 10.84 seconds | since_last: 7.31 seconds :: '

In [25]:
# Rebuild the "03_core".subfields table from its SQL file, then preview the result.
table_name = 'subfields'
db_name = '03_core'
# Fully qualified name, with the database double-quoted exactly as in the queries below.
qualified_table = f'"{db_name}".{table_name}'

timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    # Per the recorded output, 'overwrite' deletes the existing Glue Catalog
    # table and its S3 objects before the table is created again.
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    s3_parent_target_path=config.S3_CORE_DATA_PATH
)
timelogger.log(f'{qualified_table} created')

# Sanity checks: a sample of rows followed by the total row count.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM {qualified_table} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM {qualified_table} """, db_name))
# log() prints the message and also returns it; the trailing ';' keeps Jupyter
# from echoing the returned string a second time as the cell's Out value.
timelogger.log(f'{qualified_table} queries finished');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 03_core.subfields already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core subfields
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/subfields/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".subfields created | since_start: 3.66 seconds | since_last: 3.66 seconds :: 


Unnamed: 0,subfield_index,subfield_original_id,subfield_display_name,subfield_count
0,0,1707,Computer Vision and Pattern Recognition,6191


Unnamed: 0,c
0,1


 :: "03_core".subfields queries finished | since_start: 10.92 seconds | since_last: 7.26 seconds :: 


' :: "03_core".subfields queries finished | since_start: 10.92 seconds | since_last: 7.26 seconds :: '

In [26]:
# Rebuild the staging table "02_stg".stg_filtered_work_section_annotations
# from its SQL file, then preview the result.
table_name = 'stg_filtered_work_section_annotations'
db_name = '02_stg'
# Fully qualified name, with the database double-quoted exactly as in the queries below.
qualified_table = f'"{db_name}".{table_name}'

timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    # Per the recorded output, 'overwrite' deletes the existing Glue Catalog
    # table and its S3 objects before the table is created again.
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    # s3_parent_target_path is deliberately not passed: the recorded run shows
    # the helper falling back to the .../01_data/02_stg location on its own.
)
timelogger.log(f'{qualified_table} created')

# Sanity checks: a sample of rows followed by the total row count.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM {qualified_table} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM {qualified_table} """, db_name))
# log() prints the message and also returns it; the trailing ';' keeps Jupyter
# from echoing the returned string a second time as the cell's Out value.
timelogger.log(f'{qualified_table} queries finished');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_filtered_work_section_annotations already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_filtered_work_section_annotations
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_filtered_work_section_annotations/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_filtered_work_section_annotations created | since_start: 34.89 seconds | since_last: 34.89 seconds :: 


Unnamed: 0,work_id,section_index,section_type_index,n,section_start,section_end,section_type
0,1332286,108,86,,38225,38226,paragraph
1,1332286,109,87,,38230,38232,paragraph
2,1332286,110,88,,38236,38238,paragraph


Unnamed: 0,c
0,35215812


 :: "02_stg".stg_filtered_work_section_annotations queries finished | since_start: 42.45 seconds | since_last: 7.56 seconds :: 


' :: "02_stg".stg_filtered_work_section_annotations queries finished | since_start: 42.45 seconds | since_last: 7.56 seconds :: '

In [31]:
# Rebuild the staging table "02_stg".stg_filtered_work_sections
# from its SQL file, then preview the result.
table_name = 'stg_filtered_work_sections'
db_name = '02_stg'
# Fully qualified name, with the database double-quoted exactly as in the queries below.
qualified_table = f'"{db_name}".{table_name}'

timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    # Per the recorded output, 'overwrite' deletes the existing Glue Catalog
    # table and its S3 objects before the table is created again.
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    # s3_parent_target_path is deliberately not passed: the recorded run shows
    # the helper falling back to the .../01_data/02_stg location on its own.
)
timelogger.log(f'{qualified_table} created')

# Sanity checks: a sample of rows followed by the total row count.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM {qualified_table} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM {qualified_table} """, db_name))
# log() prints the message and also returns it; the trailing ';' keeps Jupyter
# from echoing the returned string a second time as the cell's Out value.
timelogger.log(f'{qualified_table} queries finished');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_filtered_work_sections already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_filtered_work_sections
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_filtered_work_sections/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_filtered_work_sections created | since_start: 44.92 seconds | since_last: 44.92 seconds :: 


Unnamed: 0,work_id,section_index,section_type_index,n,section_start,section_end,section_type,section_text,section_text_length
0,14420282,25,20,,13140,13453,paragraph,"But this widely used strategy leads to a proliferation of error rules (e.g. PP ¡ Prep Det Name and so on). Moreover, more complex phenomena like agreement hardly can be modelled by such an anticipation based error diagnosis (but see Schneider/McCoy 1998, see also Menzel 1988 for a discussion of ...",312
1,14420282,26,21,,13455,13607,paragraph,The alternative to the anticipation of errors is an approach called model based diagnosis. We discuss this kind of error diagnosis in the next section.,151
2,14420282,27,6,3.3,13609,13626,header,Menzel's Approach,17


Unnamed: 0,c
0,35215812


 :: "02_stg".stg_filtered_work_sections queries finished | since_start: 54.93 seconds | since_last: 10.00 seconds :: 


' :: "02_stg".stg_filtered_work_sections queries finished | since_start: 54.93 seconds | since_last: 10.00 seconds :: '

In [32]:
# Rebuild the staging table "02_stg".stg_unified_works_filtered_with_section_stats
# from its SQL file, then preview the result.
table_name = 'stg_unified_works_filtered_with_section_stats'
db_name = '02_stg'
# Fully qualified name, with the database double-quoted exactly as in the queries below.
qualified_table = f'"{db_name}".{table_name}'

timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    # Per the recorded output, 'overwrite' deletes the existing Glue Catalog
    # table and its S3 objects before the table is created again.
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    # s3_parent_target_path is deliberately not passed: the recorded run shows
    # the helper falling back to the .../01_data/02_stg location on its own.
)
timelogger.log(f'{qualified_table} created')

# Sanity checks: a sample of rows followed by the total row count.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM {qualified_table} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM {qualified_table} """, db_name))
# log() prints the message and also returns it; the trailing ';' keeps Jupyter
# from echoing the returned string a second time as the cell's Out value.
timelogger.log(f'{qualified_table} queries finished');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_unified_works_filtered_with_section_stats already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_unified_works_filtered_with_section_stats
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_filtered_with_section_stats/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_filtered_with_section_stats created | since_start: 33.70 seconds | since_last: 33.70 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_count,openalex_primary_topic_index,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_subfield_count,openalex_primary_topic_subfield_index,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on,title,content_abstract,content_text,annotations_paragraph,annotations_section_header,subset,number_of_sections,number_of_headers,number_of_paragraphs,has_sections,has_headers,has_paragraphs
0,225232604,3080792298,10.36227/techrxiv.12816155,,2020,2020-08-24,CCBY,1,https://doi.org/10.36227/techrxiv.12816155,1,1,1,3080792298,10.36227/techrxiv.12816155.v1,en,12072,Machine Learning and Algorithms,,,1702,Artificial Intelligence,,,17,Computer Science,3,Physical Sciences,mag,A novel sequential RDF to compute partially observable Markov processes with MSE distortion,We develop a new sequential rate distortion functionto compute lower bounds on the average length of all causal prefix free codes for partially observable multivariate Markov processes with mean-squared error distortion constraint. Our information measure is characterized by a variant of causall...,"\nI. INTRODUCTION\n\nNonanticipatory −entropy was introduced in [2], [3] motived by real-time communication with minimal encoding and decoding delays. This entity is shown to be a tight lower bound on causal codes for scalar processes [4] whereas for vector processes it provides a tight lower bo...","[{""attributes"":null,""end"":422,""start"":18},{""attributes"":null,""end"":2530,""start"":424},{""attributes"":null,""end"":3249,""start"":2532},{""attributes"":null,""end"":4727,""start"":3251},{""attributes"":null,""end"":7215,""start"":4747},{""attributes"":null,""end"":10033,""start"":7217},{""attributes"":null,""end"":11088,""st...","[{""attributes"":null,""end"":16,""start"":1},{""attributes"":null,""end"":4745,""start"":4729},{""attributes"":null,""end"":11111,""start"":11090},{""attributes"":null,""end"":13448,""start"":13442},{""attributes"":null,""end"":13609,""start"":13587},{""attributes"":null,""end"":19858,""start"":19799},{""attributes"":null,""end"":214...",train,165,11,154,1,1,1
1,4546238,2952828455,10.1145/2840728.2840729,1504.06544,2015,2015-04-24,CCBYNC,1,https://arxiv.org/abs/1504.06544,1,1,1,2336478268,10.1145/2840728.2840729,en,12072,Machine Learning and Algorithms,,,1702,Artificial Intelligence,,,17,Computer Science,3,Physical Sciences,doi,Sampling Correctors,"In many situations, sample data is obtained from a noisy or imperfect source. In order to address such corruptions, this paper introduces the concept of a sampling corrector. Such algorithms use structure that the distribution is purported to have, in order to allow one to make ""on-the-fly"" corr...","\nIntroduction\n\nData consisting of samples from distributions is notorious for reliability issues: Sample data can be greatly affected by noise, calibration problems or other faults in the sample recording process; portions of data may be lost; extraneous samples may be erroneously recorded. S...","[{""attributes"":null,""end"":846,""start"":15},{""attributes"":null,""end"":2091,""start"":848},{""attributes"":null,""end"":2865,""start"":2093},{""attributes"":null,""end"":3270,""start"":2867},{""attributes"":null,""end"":3874,""start"":3272},{""attributes"":null,""end"":4897,""start"":3876},{""attributes"":null,""end"":6703,""star...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{""attributes"":{""n"":""1.2""},""end"":4910,""start"":4899},{""attributes"":null,""end"":12794,""start"":12774},{""attributes"":{""n"":""1.3""},""end"":14350,""start"":14337},{""attributes"":{""n"":""2""},""end"":17443,""start"":17430},{""attributes"":{""n"":""3""},""end"":21640,""start"":21618}...",train,304,37,267,1,1,1
2,11918854,2950867794,10.1109/focs.2016.85,1604.06443,2016,2016-04-21,CCBYNC,1,https://arxiv.org/abs/1604.06443,1,1,1,2964008913,10.1109/focs.2016.85,en,12072,Machine Learning and Algorithms,,,1702,Artificial Intelligence,,,17,Computer Science,3,Physical Sciences,doi,Robust Estimators in High Dimensions without the Computational Intractability,"We study high-dimensional distribution learning in an agnostic setting where an adversary is allowed to arbitrarily corrupt an epsilon fraction of the samples. Such questions have a rich history spanning statistics, machine learning and theoretical computer science. Even in the most basic settin...","\nI. INTRODUCTION\n\nA. Background\n\nA central goal of machine learning is to design efficient algorithms for fitting a model to a collection of observations. In recent years, there has been considerable progress on a variety of problems in this domain, including algorithms with provable guaran...","[{""attributes"":null,""end"":574,""start"":33},{""attributes"":null,""end"":804,""start"":576},{""attributes"":null,""end"":2118,""start"":806},{""attributes"":null,""end"":2718,""start"":2120},{""attributes"":null,""end"":3505,""start"":2720},{""attributes"":null,""end"":4662,""start"":3507},{""attributes"":null,""end"":5331,""start""...","[{""attributes"":null,""end"":16,""start"":1},{""attributes"":null,""end"":31,""start"":18},{""attributes"":null,""end"":4681,""start"":4664},{""attributes"":null,""end"":11153,""start"":11139},{""attributes"":null,""end"":16429,""start"":16399},{""attributes"":null,""end"":19283,""start"":19260},{""attributes"":null,""end"":20427,""st...",train,100,16,84,1,1,1


Unnamed: 0,c
0,396052


 :: "02_stg".stg_unified_works_filtered_with_section_stats queries finished | since_start: 46.12 seconds | since_last: 12.42 seconds :: 


' :: "02_stg".stg_unified_works_filtered_with_section_stats queries finished | since_start: 46.12 seconds | since_last: 12.42 seconds :: '

In [33]:
# Rebuild the staging table "02_stg".stg_filtered_work_sections_with_headers
# from its SQL file, then preview the result.
table_name = 'stg_filtered_work_sections_with_headers'
db_name = '02_stg'
# Fully qualified name, with the database double-quoted exactly as in the queries below.
qualified_table = f'"{db_name}".{table_name}'

timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    # Per the recorded output, 'overwrite' deletes the existing Glue Catalog
    # table and its S3 objects before the table is created again.
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    # s3_parent_target_path is deliberately not passed: the recorded run shows
    # the helper falling back to the .../01_data/02_stg location on its own.
)
timelogger.log(f'{qualified_table} created')

# Sanity checks: a sample of rows followed by the total row count.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM {qualified_table} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM {qualified_table} """, db_name))
# log() prints the message and also returns it; the trailing ';' keeps Jupyter
# from echoing the returned string a second time as the cell's Out value.
timelogger.log(f'{qualified_table} queries finished');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_filtered_work_sections_with_headers already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_filtered_work_sections_with_headers
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_filtered_work_sections_with_headers/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_filtered_work_sections_with_headers created | since_start: 34.14 seconds | since_last: 34.14 seconds :: 


Unnamed: 0,work_id,section_index,section_type_index,n,section_start,section_end,section_type,section_text,section_text_length,work_has_sections,work_has_headers,work_has_paragraphs,work_number_of_sections,work_number_of_headers,work_number_of_paragraphs
0,250649874,92,65,,36865,36866,paragraph,,0,1,1,1,137,27,110
1,250649874,93,66,,36870,36871,paragraph,,0,1,1,1,137,27,110
2,250649874,94,67,,36875,36876,paragraph,,0,1,1,1,137,27,110


Unnamed: 0,c
0,35173870


 :: "02_stg".stg_filtered_work_sections_with_headers queries finished | since_start: 44.12 seconds | since_last: 9.98 seconds :: 


' :: "02_stg".stg_filtered_work_sections_with_headers queries finished | since_start: 44.12 seconds | since_last: 9.98 seconds :: '

In [38]:
# Rebuild the staging table "02_stg".stg_filtered_work_chapters
# from its SQL file, then preview the result.
table_name = 'stg_filtered_work_chapters'
db_name = '02_stg'
# Fully qualified name, with the database double-quoted exactly as in the queries below.
qualified_table = f'"{db_name}".{table_name}'

timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    # Per the recorded output, 'overwrite' deletes the existing Glue Catalog
    # table and its S3 objects before the table is created again.
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    # s3_parent_target_path is deliberately not passed: the recorded run shows
    # the helper falling back to the .../01_data/02_stg location on its own.
)
timelogger.log(f'{qualified_table} created')

# Sanity checks: a sample of rows followed by the total row count.
utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM {qualified_table} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM {qualified_table} """, db_name))
# log() prints the message and also returns it; the trailing ';' keeps Jupyter
# from echoing the returned string a second time as the cell's Out value.
timelogger.log(f'{qualified_table} queries finished');

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_filtered_work_chapters already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_filtered_work_chapters
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_filtered_work_chapters/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_filtered_work_chapters created | since_start: 48.03 seconds | since_last: 48.03 seconds :: 


Unnamed: 0,work_id,header_index,number_of_paragraphs_in_chapter,min_section_start,max_section_end,header_text,header_text_lower,header_text_length,chapter_text,chapter_text_length,section_starts,section_ends,section_indices,section_type_indices,chapter_is_research_methodology
0,258037205,6,5,15137,17314,Fuzzy Min-Max Neural Network,fuzzy min-max neural network,28,Simpson pioneered the hyperboxes for pattern classification [16].FMM learns using a hyperbox fuzzy set.An expansion parameter theta (θ) controls the size of the hyperbox; in this case the theta (θ) ranges from values 0 to 1.The maximum (max) and minimum (min) points in a hyperbox are used to mea...,2147,"[15137, 15167, 15559, 15750, 16795, 16924]","[15165, 15557, 15748, 16793, 16922, 17314]","[31, 32, 33, 34, 35, 36]","[6, 26, 27, 28, 29, 30]",
1,272459276,2,1,6530,7286,•,•,1,"The GFPN network architecture is enhanced by incorporating the concept of Dysample offset, aiming to address the limitations of the original GFPN architecture and effectively extract deep information from low-light scenes as well as to capture potential information within the low-light dataset. ...",752,"[6530, 6533]","[6531, 7286]","[10, 11]","[2, 9]",
2,258037205,5,1,14547,15135,Module 2 3.2.1. Machine Learning Classifiers,module 2 3.2.1. machine learning classifiers,44,"Classification is a machine learning method that determines which class a new object belongs to based on a set of predefined classes.There are numerous classifiers that can be used to classify data, including decision trees, bays, functions, rules, lazy, meta, and so on.In this work we used diff...",542,"[14547, 14593]","[14591, 15135]","[29, 30]","[5, 25]",


Unnamed: 0,c
0,6834562


 :: "02_stg".stg_filtered_work_chapters queries finished | since_start: 58.15 seconds | since_last: 10.12 seconds :: 


' :: "02_stg".stg_filtered_work_chapters queries finished | since_start: 58.15 seconds | since_last: 10.12 seconds :: '