In [2]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Adding ../01_modules or ./01_modules to the system path so that we can load modules from 
# there as well
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
if os.path.exists(modules_path_in_dev):
    sys.path.append(modules_path_in_dev)
if os.path.exists(modules_path_in_prod):
    sys.path.append(modules_path_in_prod)


# # Jupyter only reads a local module the first time after 
# # kernel start. Re-running a cell with 
# # "from mymodulename import *" would not change
# # anything, even if the imported module has since changed.
# # As a workaround, we need to directly load the module, 
# # use importlib.reload to reload it and then import * 
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [2]:
if False: # do not run this again, takes a day
    execution_role = get_execution_role()
    source_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    print('source_dir:', source_dir)
    sklearn_processor = FrameworkProcessor(
        estimator_cls=SKLearn,
        framework_version='1.2-1', # The newest supported version by sagemaker
        instance_type='ml.c7i.16xlarge',
        instance_count=1,
        base_job_name=f'openalex_works_reduction'.replace('_','-'),
        role=execution_role
    )
    
    step_args = sklearn_processor.run(
        code='src/03_transformation/03_11_transformation_openalex_works_reduction.py',
        source_dir=source_dir,
        inputs=[], # We are not using automatic input-output mapping, instead we handle everything in the script directly on S3
        outputs=[],
        arguments=[
            '--runtype', 'prod',
            '--file-max-limit', '10000',
        ],
        wait=True
    )

In [30]:
table_name = 'base_openalex_works_reduced_filtered'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_openalex_works_reduced_filtered/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_openalex_works_reduced_filtered created | since_start: 42.36 seconds | since_last: 42.36 seconds :: 


Unnamed: 0,id_openalex,id_doi,title,language,primary_topic_id,primary_topic_display_name,primary_topic_subfield_id,primary_topic_subfield_display_name,primary_topic_field_id,primary_topic_field_display_name,primary_topic_domain_id,primary_topic_domain_display_name
0,2184395120,,AN APPROACH FOR COMPRESSING DIGITAL IMAGES BY USING RUN LENGTH ENCODING,en,T10901,Advanced Data Compression Techniques,1707,Computer Vision and Pattern Recognition,17,Computer Science,3,Physical Sciences
1,2184397778,,LEARNINGMINIMUM VOLUME SETSWITH SUPPORTVECTOR MACHINES,en,T11512,Anomaly Detection Techniques and Applications,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences
2,2184397944,,Analyzing thePerformance ofVoice over Internet Protocol ina 3GNetwork,en,T10575,Wireless Communication Networks Research,1705,Computer Networks and Communications,17,Computer Science,3,Physical Sciences


Unnamed: 0,c
0,13242469


 :: "02_stg".base_openalex_works_reduced_filtered queries finished | since_start: 50.44 seconds | since_last: 8.08 seconds :: 


' :: "02_stg".base_openalex_works_reduced_filtered queries finished | since_start: 50.44 seconds | since_last: 8.08 seconds :: '

In [3]:
table_name = 'base_semanticscholar_s2orcv2'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.base_semanticscholar_s2orcv2 already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg base_semanticscholar_s2orcv2
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_semanticscholar_s2orcv2/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_semanticscholar_s2orcv2 created | since_start: 11.0 minutes, 1.71 seconds | since_last: 11.0 minutes, 1.71 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,title,source_url,openaccess_status,content_text,annotations_paragraph,annotations_section_header,license
0,1808709,2101143116,10.1186/1476-511x-13-132,,Twist 1 regulates the expression of PPARγ during hormone-induced 3T3-L1 preadipocyte differentiation: a possible role in obesity and associated diseases,https://pmc.ncbi.nlm.nih.gov/articles/PMC4150960,GOLD,"\nBackground\n\nObesity has become an epidemic in the human population, and China has the highest number of obese patients in the world [1]. Because obesity involves an increase in the number of adipocytes, any of the factors involved in adipocyte differentiation might be of great importance for the development of obesity. To date, numerous factors and proteins have been implicated in the generation of new fat cells, including peroxisome proliferator-activated receptor gamma (PPARγ) [2,3], C...","[{""attributes"":null,""end"":781,""start"":13},{""attributes"":null,""end"":2087,""start"":783},{""attributes"":null,""end"":3470,""start"":2089},{""attributes"":null,""end"":4312,""start"":3472},{""attributes"":null,""end"":5329,""start"":4323},{""attributes"":null,""end"":5818,""start"":5409},{""attributes"":null,""end"":6151,""start"":5890},{""attributes"":null,""end"":6744,""start"":6218},{""attributes"":null,""end"":8054,""start"":6870},{""attributes"":null,""end"":8511,""start"":8056},{""attributes"":null,""end"":9031,""start"":8525},{""attributes"":n...","[{""attributes"":null,""end"":11,""start"":1},{""attributes"":null,""end"":4321,""start"":4314},{""attributes"":null,""end"":5407,""start"":5331},{""attributes"":null,""end"":5888,""start"":5820},{""attributes"":null,""end"":6216,""start"":6153},{""attributes"":null,""end"":6868,""start"":6746},{""attributes"":null,""end"":8523,""start"":8513},{""attributes"":null,""end"":16548,""start"":16527},{""attributes"":null,""end"":16559,""start"":16550},{""attributes"":null,""end"":17519,""start"":17439},{""attributes"":null,""end"":19472,""start"":19445},{""attrib...",CCBY
1,3738414,2789005638,10.1155/2018/1042479,,PROM and Labour Effects on Urinary Metabolome: A Pilot Study,https://pmc.ncbi.nlm.nih.gov/articles/PMC5817378,GOLD,"\nIntroduction\n\nThe early diagnosis of pregnancy-related complications and the prediction of pregnancy outcome are considered strategic clinical goals to ensure the health of mothers and of their babies. Among these, premature rupture of membranes (PROM) consists of the rupture of the foetal membranes before the onset of labour. It can be observed at any gestational age [1] and occurs in approximately 10% of pregnant women and in roughly 40% of preterm deliveries [2]. Foetal membranes are ...","[{""attributes"":null,""end"":4623,""start"":15},{""attributes"":null,""end"":6006,""start"":4703},{""attributes"":null,""end"":7040,""start"":6040},{""attributes"":null,""end"":7808,""start"":7063},{""attributes"":null,""end"":9464,""start"":7836},{""attributes"":null,""end"":10835,""start"":9466},{""attributes"":null,""end"":11411,""start"":10861},{""attributes"":null,""end"":12575,""start"":11413},{""attributes"":null,""end"":12831,""start"":12585},{""attributes"":null,""end"":12984,""start"":12833},{""attributes"":null,""end"":13373,""start"":12986},{""...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":4646,""start"":4625},{""attributes"":{""n"":""2.1.""},""end"":4701,""start"":4648},{""attributes"":{""n"":""2.2.""},""end"":6038,""start"":6008},{""attributes"":{""n"":""2.3.""},""end"":7061,""start"":7042},{""attributes"":{""n"":""2.4.""},""end"":7834,""start"":7810},{""attributes"":{""n"":""3.""},""end"":10859,""start"":10837},{""attributes"":{""n"":""3.3.""},""end"":12583,""start"":12577},{""attributes"":{""n"":""4.""},""end"":13385,""start"":13375},{""attributes"":{""n"":""5.""},""end"":167...",CCBY
2,15710447,2741402565,10.18653/v1/w17-1005,,Word Embedding and Topic Modeling Enhanced Multiple Features for Content Linking and Argument / Sentiment Labeling in Online Forums,https://aclanthology.org/W17-1005,HYBRID,"\nIntroduction\n\nComments to news and their providers in online forums have been increasing rapidly in recent years with a large number of user participants and huge amount of interactive contents. How can we understand the mass of comments effectively? A crucial initial step towards this goal should be content linking, which is to determine what comments link to, be that either specific news snippets or comments by other users. Furthermore, a set of labels for a given link may be articulat...","[{""attributes"":null,""end"":585,""start"":15},{""attributes"":null,""end"":813,""start"":587},{""attributes"":null,""end"":1141,""start"":815},{""attributes"":null,""end"":1511,""start"":1143},{""attributes"":null,""end"":2352,""start"":1531},{""attributes"":null,""end"":3126,""start"":2363},{""attributes"":null,""end"":3193,""start"":3145},{""attributes"":null,""end"":3418,""start"":3211},{""attributes"":null,""end"":3543,""start"":3452},{""attributes"":null,""end"":3672,""start"":3545},{""attributes"":null,""end"":3984,""start"":3674},{""attributes"":nul...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{""attributes"":{""n"":""2""},""end"":1529,""start"":1513},{""attributes"":{""n"":""3""},""end"":2361,""start"":2354},{""attributes"":{""n"":""3.1""},""end"":3143,""start"":3128},{""attributes"":{""n"":""3.1.1""},""end"":3209,""start"":3195},{""attributes"":{""n"":""3.1.2""},""end"":3450,""start"":3420},{""attributes"":{""n"":""3.1.3""},""end"":5187,""start"":5154},{""attributes"":{""n"":""3.2""},""end"":6316,""start"":6302},{""attributes"":null,""end"":7038,""start"":7024},{""attributes"":null,""end"":7173,""start"":7150},{""at...",CCBY


Unnamed: 0,c
0,11609787


 :: "02_stg".base_semanticscholar_s2orcv2 queries finished | since_start: 11.0 minutes, 11.75 seconds | since_last: 10.04 seconds :: 


' :: "02_stg".base_semanticscholar_s2orcv2 queries finished | since_start: 11.0 minutes, 11.75 seconds | since_last: 10.04 seconds :: '

In [4]:
table_name = 'stg_semanticscholar_combined_works'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_semanticscholar_combined_works already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_semanticscholar_combined_works
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_semanticscholar_combined_works/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_semanticscholar_combined_works created | since_start: 13.0 minutes, 48.32 seconds | since_last: 13.0 minutes, 48.32 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,title,source_url,openaccess_status,content_text,annotations_paragraph,annotations_section_header,license,content_abstract,publication_year,publication_date
0,220531160,3042431448.0,10.1167/iovs.61.8.16,,Quantitative Fundus Autofluorescence in Rhesus Macaques in Aging and Age-Related Drusen,https://pmc.ncbi.nlm.nih.gov/articles/PMC7425688,GOLD,"\nD iseases of the macula, such as age-related macular degeneration (AMD) and diabetic macular edema, are leading causes of visual impairment in developed countries. 1 Animal models of macular conditions can further detail the mechanisms of their pathogenesis and reveal new insights into developing novel interventions. Nonhuman primates (NHPs) are a compelling animal model for studying macular diseases as they are the only mammals beside humans to possess a true macula. NHPs, such as rhesus ...","[{""attributes"":null,""end"":1340,""start"":1},{""attributes"":null,""end"":2554,""start"":1342},{""attributes"":null,""end"":4618,""start"":2580},{""attributes"":null,""end"":6008,""start"":4640},{""attributes"":null,""end"":7798,""start"":6010},{""attributes"":null,""end"":9037,""start"":7824},{""attributes"":null,""end"":10039,""start"":9078},{""attributes"":null,""end"":11377,""start"":10063},{""attributes"":null,""end"":12469,""start"":11391},{""attributes"":null,""end"":13219,""start"":12492},{""attributes"":null,""end"":13875,""start"":13257},{""att...","[{""attributes"":null,""end"":2563,""start"":2556},{""attributes"":null,""end"":2578,""start"":2565},{""attributes"":null,""end"":4638,""start"":4620},{""attributes"":null,""end"":7822,""start"":7800},{""attributes"":null,""end"":9076,""start"":9039},{""attributes"":null,""end"":10061,""start"":10041},{""attributes"":null,""end"":11389,""start"":11379},{""attributes"":null,""end"":12478,""start"":12471},{""attributes"":null,""end"":12490,""start"":12480},{""attributes"":null,""end"":13255,""start"":13221},{""attributes"":null,""end"":14316,""start"":14284}...",CCBYNCND,"Purpose To employ quantitative fundus autofluorescence (qAF) imaging in rhesus macaques to noninvasively assess retinal pigment epithelial (RPE) lipofuscin in nonhuman primates (NHPs) as a model of aging and age-related macular degeneration (AMD). Methods The qAF imaging was performed on eyes of 26 rhesus macaques (mean age 18.8 ± 8.2 years, range 4–27 years) with normal-appearing fundus or with age-related soft drusen using a confocal scanning laser ophthalmoscope with 488 nm excitation and...",2020,2020-07-01
1,268446036,,10.1007/s40670-024-02017-9,,Humanism Rounds: A Multifaceted “Back to Bedside” Initiative to Improve Meaning at Work for Internal Medicine Residents,https://pmc.ncbi.nlm.nih.gov/articles/PMC11180076,HYBRID,"\nIntroduction\n\nBurnout affects medical residents nationwide, leading to poor resident wellbeing, career dissatisfaction, and decreased quality of patient care [1,2].The rates of burnout among residents range from 27 to 75%, with high rates noted in obstetrics and gynecology (75%), internal medicine (63%), and general surgery (40%) with the lowest rate among family medicine residents (27%) [3].Research into burnout during residency has focused on a variety of contributing factors including...","[{""attributes"":null,""end"":1032,""start"":15},{""attributes"":null,""end"":1487,""start"":1034},{""attributes"":null,""end"":1960,""start"":1489},{""attributes"":null,""end"":2269,""start"":1985},{""attributes"":null,""end"":3087,""start"":2291},{""attributes"":null,""end"":3559,""start"":3118},{""attributes"":null,""end"":4961,""start"":3612},{""attributes"":null,""end"":5991,""start"":4980},{""attributes"":null,""end"":6547,""start"":5993},{""attributes"":null,""end"":7174,""start"":6571},{""attributes"":null,""end"":7594,""start"":7185},{""attributes""...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":1983,""start"":1962},{""attributes"":null,""end"":2289,""start"":2271},{""attributes"":null,""end"":3103,""start"":3089},{""attributes"":null,""end"":3116,""start"":3105},{""attributes"":null,""end"":3590,""start"":3561},{""attributes"":null,""end"":3610,""start"":3592},{""attributes"":null,""end"":4978,""start"":4963},{""attributes"":null,""end"":6569,""start"":6549},{""attributes"":null,""end"":7183,""start"":7176},{""attributes"":null,""end"":7641,""start"":7596},{""attributes"":nu...",CCBY,"Introduction Burnout is an increasingly prevalent problem among resident physicians. To address this problem, the Accreditation Council on Graduate Medical Education (ACGME) created the Back to Bedside initiative, supporting resident-driven projects focused on increasing direct interactions with patients. In 2017, Baylor College of Medicine (BCM) Internal Medicine Residency received a Back to Bedside grant to develop and implement “Humanism Rounds,” a multifaceted program which sought to pro...",2024,2024-03-13
2,249401160,,10.1007/s12471-022-01700-z,,Major adverse cardiovascular events in older emergency department patients presenting with non-cardiac medical complaints,https://pmc.ncbi.nlm.nih.gov/articles/PMC9691805,GOLD,"\nIntroduction\n\nOlder patients are at high risk of adverse outcomes after an emergency department (ED) visit [1,2]. However, the risk of major adverse cardiovascular events (MACE) for older ED patients, presenting with noncardiac medical complaints, is unknown. Because preventive measures may improve outcome [3], early identification of patients at risk is highly important. \n\nBesides conventional cardiovascular risk factors, the cardiac biomarkers high-sensitivity cardiac Troponin T (hs-...","[{""attributes"":null,""end"":376,""start"":15},{""attributes"":null,""end"":783,""start"":378},{""attributes"":null,""end"":1290,""start"":785},{""attributes"":null,""end"":2491,""start"":1337},{""attributes"":null,""end"":2771,""start"":2493},{""attributes"":null,""end"":3143,""start"":2796},{""attributes"":null,""end"":4944,""start"":3162},{""attributes"":null,""end"":5900,""start"":4946},{""attributes"":null,""end"":6149,""start"":5902},{""attributes"":null,""end"":6368,""start"":6169},{""attributes"":null,""end"":6672,""start"":6385},{""attributes"":nul...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":1299,""start"":1292},{""attributes"":null,""end"":1335,""start"":1301},{""attributes"":null,""end"":2794,""start"":2773},{""attributes"":null,""end"":3160,""start"":3145},{""attributes"":null,""end"":6167,""start"":6151},{""attributes"":null,""end"":6383,""start"":6370},{""attributes"":null,""end"":8057,""start"":8050},{""attributes"":null,""end"":8103,""start"":8059},{""attributes"":null,""end"":9631,""start"":9566},{""attributes"":null,""end"":10054,""start"":10031},{""attributes"":...",CCBY,"The risk of major adverse cardiovascular events (MACE) for older emergency department (ED) patients presenting with non-cardiac medical complaints is unknown. To apply preventive measures timely, early identification of high-risk patients is incredibly important. We aimed at investigating the incidence of MACE within one year after their ED visit and the predictive value of high-sensitivity cardiac troponin T (hs-cTnT) and N‑terminal pro-B-type natriuretic peptide (NT-proBNP) for subsequent ...",2022,2022-06-07


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_semanticscholar_combined_works queries finished | since_start: 13.0 minutes, 58.61 seconds | since_last: 10.29 seconds :: 


' :: "02_stg".stg_semanticscholar_combined_works queries finished | since_start: 13.0 minutes, 58.61 seconds | since_last: 10.29 seconds :: '

In [18]:
table_name = 'stg_semanticscholar_combined_works_content'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_semanticscholar_combined_works_content/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_semanticscholar_combined_works_content created | since_start: 11.0 minutes, 20.68 seconds | since_last: 11.0 minutes, 20.68 seconds :: 


Unnamed: 0,id_semanticscholar,title,content_abstract,content_text,annotations_paragraph,annotations_section_header
0,260163299,Screening depression and anxiety in Indigenous peoples: A global scoping review,"Indigenous peoples’ worldviews are intricately interconnected and interrelated with their communities and the environments in which they live. Their worldviews also manifest in a holistic view of health and well-being, which contrasts with those of the dominant western biomedical model. However,...","\nIntroduction\n\nThe worldviews of Indigenous peoples are intricately interrelated and interconnected with those of their communities and the environments in which they live. Indigenous people conceptualise health and well-being more holistically (Gall et al., 2021) than the dominant western bi...","[{""attributes"":null,""end"":512,""start"":15},{""attributes"":null,""end"":1314,""start"":514},{""attributes"":null,""end"":2095,""start"":1316},{""attributes"":null,""end"":3698,""start"":2097},{""attributes"":null,""end"":4904,""start"":3700},{""attributes"":null,""end"":5652,""start"":4906},{""attributes"":null,""end"":6585,""star...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":5660,""start"":5654},{""attributes"":null,""end"":7233,""start"":7215},{""attributes"":null,""end"":11222,""start"":11207},{""attributes"":null,""end"":12489,""start"":12475},{""attributes"":null,""end"":13587,""start"":13572},{""attributes"":null,""end"":14789..."
1,112601881,Model Development of a Blast Furnace Stove,,\nIntroduction\n\nAbout one third of the world primary energy consumption is from the manufacturing industries. The iron and steel industry (ISI) is the second largest energy user and accounts for 20 % of the energy usage by the manufacturing industries [1]. Due to heavy reliance on fossil fuels...,"[{""attributes"":null,""end"":519,""start"":15},{""attributes"":null,""end"":1303,""start"":521},{""attributes"":null,""end"":1735,""start"":1305},{""attributes"":null,""end"":2270,""start"":1737},{""attributes"":null,""end"":2995,""start"":2272},{""attributes"":null,""end"":3406,""start"":2997},{""attributes"":null,""end"":3691,""star...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":3414,""start"":3408},{""attributes"":null,""end"":4208,""start"":4172},{""attributes"":{""n"":""2.1.""},""end"":5536,""start"":5527},{""attributes"":{""n"":""2.2.""},""end"":7243,""start"":7228},{""attributes"":{""n"":""2.3.""},""end"":7941,""start"":7909},..."
2,118573517,Double beta decay transition mechanism,"After briefly reviewing $\beta \beta$ decay as a test of the neutrino mass, I examine the nuclear structure involved in this process. Simple formulas (\`{a} la Pad\'{e}) are designed for the transition amplitudes and the general behavior of $\beta \beta$ decay amplitudes in the quasiparticle ran...",\nIntroduction\n\nThe double beta (ββ) decay is a nice example of the interrelation between the Particle Physics and the Nuclear Physics: we can get information on the properties of the neutrino and the weak interaction from the ββ decay only if we know who to deal we the nuclear structure invol...,"[{""attributes"":null,""end"":582,""start"":15},{""attributes"":null,""end"":1069,""start"":584},{""attributes"":null,""end"":1224,""start"":1071},{""attributes"":null,""end"":1310,""start"":1226},{""attributes"":null,""end"":1343,""start"":1312},{""attributes"":null,""end"":1420,""start"":1345},{""attributes"":null,""end"":1839,""star...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""3.""},""end"":10044,""start"":9997},{""attributes"":{""n"":""4.""},""end"":14193,""start"":14165}]"


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_semanticscholar_combined_works_content queries finished | since_start: 11.0 minutes, 31.19 seconds | since_last: 10.51 seconds :: 


' :: "02_stg".stg_semanticscholar_combined_works_content queries finished | since_start: 11.0 minutes, 31.19 seconds | since_last: 10.51 seconds :: '

In [19]:
table_name = 'stg_semanticscholar_combined_works_metadata'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_semanticscholar_combined_works_metadata/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_semanticscholar_combined_works_metadata created | since_start: 5.85 seconds | since_last: 5.85 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,source_url,openaccess_status,license,publication_year,publication_date
0,46791531,2755160340.0,10.1364/oe.25.023899,,https://doi.org/10.1364/OE.25.023899,GOLD,CCBY,2017,2017-10-02
1,13140185,2010414254.0,10.3390/rs6031863,,https://doi.org/10.3390/rs6031863,GOLD,CCBY,2014,2014-02-28
2,253366618,,10.1016/j.xpro.2022.101803,,https://pmc.ncbi.nlm.nih.gov/articles/PMC9641055,GOLD,CCBYNCND,2022,2022-11-04


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_semanticscholar_combined_works_metadata queries finished | since_start: 15.13 seconds | since_last: 9.28 seconds :: 


' :: "02_stg".stg_semanticscholar_combined_works_metadata queries finished | since_start: 15.13 seconds | since_last: 9.28 seconds :: '

In [5]:
table_name = 'base_arxiv_metadata'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.base_arxiv_metadata already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg base_arxiv_metadata
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_arxiv_metadata/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_arxiv_metadata created | since_start: 15.44 seconds | since_last: 15.44 seconds :: 


Unnamed: 0,id_arxiv,id_doi,title,abstract,license
0,903.1601,,Parabolic-Dish Solar Concentrators of Film on Foam,"Parabolic and spherical mirrors are constructed of aluminized PET polyester film on urethane foam. During construction, the chosen shape of the mirror is created by manipulating the elastic/plastic behavior of the film with air pressure. Foam is then applied to the film and, once hardened, air pressure is removed. At an f-number of 0.68, preliminary models have an optical angular spread of less than 0.25 degrees, a factor of 3.3 smaller than that for a perfectly spherical mirror. The possi...",ArXiv nonexclusive-distrib
1,903.1604,10.3842/SIGMA.2009.029,Limits of Gaudin Systems: Classical and Quantum Cases,"We consider the XXX homogeneous Gaudin system with $N$ sites, both in classical and the quantum case. In particular we show that a suitable limiting procedure for letting the poles of its Lax matrix collide can be used to define new families of Liouville integrals (in the classical case) and new ""Gaudin"" algebras (in the quantum case). We will especially treat the case of total collisions, that gives rise to (a generalization of) the so called Bending flows of Kapovich and Millson. Some as...",CCBYNCSA
2,903.16,,Typically Real Harmonic Functions,We consider a class $\THO$ of typically real harmonic functions on the unit disk that contains the class of normalized analytic and typically real functions. We also obtain some partial results about the region of univalence for this class.,ArXiv nonexclusive-distrib


Unnamed: 0,c
0,2816721


 :: "02_stg".base_arxiv_metadata queries finished | since_start: 24.78 seconds | since_last: 9.35 seconds :: 


' :: "02_stg".base_arxiv_metadata queries finished | since_start: 24.78 seconds | since_last: 9.35 seconds :: '

In [21]:
table_name = 'stg_unified_works_metadata_01_joined_to_arxiv'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_metadata_01_joined_to_arxiv/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_metadata_01_joined_to_arxiv created | since_start: 11.79 seconds | since_last: 11.79 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,source_url,openaccess_status,publication_year,publication_date,license,license_allows_derivative_reuse
0,4951032,2789407911,10.1016/j.tecto.2018.03.010,,https://doi.org/10.1016/J.TECTO.2018.03.010,HYBRID,2018,2018-04-22,CCBY,1
1,204923380,2981732074,10.1016/j.jalz.2019.08.201,,https://pmc.ncbi.nlm.nih.gov/articles/PMC7012375,HYBRID,2019,2019-10-28,CCBYNCND,0
2,14519185,2114693455,10.1099/vir.0.007377-0,,https://pmc.ncbi.nlm.nih.gov/articles/PMC2885064,HYBRID,2009,2009-03-01,CCBY,1


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_unified_works_metadata_01_joined_to_arxiv queries finished | since_start: 21.14 seconds | since_last: 9.35 seconds :: 


' :: "02_stg".stg_unified_works_metadata_01_joined_to_arxiv queries finished | since_start: 21.14 seconds | since_last: 9.35 seconds :: '

In [15]:
table_name = 'stg_unified_works_metadata_02_joined_to_openalex'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_metadata_02_joined_to_openalex/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_metadata_02_joined_to_openalex created | since_start: 9.75 seconds | since_last: 9.75 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on
0,239285849,,10.3991/ijet.v16i18.24251,,2021,2021-09-20,CCBY,1,https://doi.org/10.3991/ijet.v16i18.24251,0,1,1,3200054761,10.3991/ijet.v16i18.24251,en,11122,Online Learning and Analytics,1706,Computer Science Applications,17,Computer Science,3,Physical Sciences,doi
1,248062035,,10.3390/rs14071753,,2022,2022-04-06,CCBY,1,https://doi.org/10.3390/rs14071753,0,1,1,4224245753,10.3390/rs14071753,en,11276,Solar Radiation and Photovoltaics,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,doi
2,269481513,,10.1109/tvcg.2024.3395365,,2024,2024-04-30,CCBYNCND,0,https://doi.org/10.1109/TVCG.2024.3395365,0,1,1,4396523158,10.1109/tvcg.2024.3395365,en,10734,Information and Cyber Security,1710,Information Systems,17,Computer Science,3,Physical Sciences,doi


Unnamed: 0,c
0,833261


 :: "02_stg".stg_unified_works_metadata_02_joined_to_openalex queries finished | since_start: 19.15 seconds | since_last: 9.40 seconds :: 


' :: "02_stg".stg_unified_works_metadata_02_joined_to_openalex queries finished | since_start: 19.15 seconds | since_last: 9.40 seconds :: '

In [16]:
table_name = 'stg_unified_works_metadata_03_filtered_and_tagged'
db_name = '02_stg'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_unified_works_metadata_03_filtered_and_tagged already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_unified_works_metadata_03_filtered_and_tagged
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_metadata_03_filtered_and_tagged/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_metadata_03_filtered_and_tagged created | since_start: 6.93 seconds | since_last: 6.93 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on,bucket_10p,subset
0,2557485,2054117411.0,10.1155/2013/913038,,2013,2013-07-18,CCBY,1,https://pmc.ncbi.nlm.nih.gov/articles/PMC3732635,1,1,1,2054117411,10.1155/2013/913038,en,10054,Parallel Computing and Optimization Techniques,1708,Hardware and Architecture,17,Computer Science,3,Physical Sciences,doi,1,test
1,272597057,,10.46586/tches.v2024.i4.231-257,,2024,,CCBY,1,https://doi.org/10.46586/tches.v2024.i4.231-257,0,1,1,4402807365,10.46586/tches.v2024.i4.231-257,en,10054,Parallel Computing and Optimization Techniques,1708,Hardware and Architecture,17,Computer Science,3,Physical Sciences,doi,1,test
2,7807512,1989540889.0,10.1155/1995/278064,,1995,,CCBY,1,https://doi.org/10.1155/1995/278064,1,1,1,1989540889,10.1155/1995/278064,en,10054,Parallel Computing and Optimization Techniques,1708,Hardware and Architecture,17,Computer Science,3,Physical Sciences,doi,1,test


Unnamed: 0,c
0,396052


 :: "02_stg".stg_unified_works_metadata_03_filtered_and_tagged queries finished | since_start: 15.58 seconds | since_last: 8.65 seconds :: 


' :: "02_stg".stg_unified_works_metadata_03_filtered_and_tagged queries finished | since_start: 15.58 seconds | since_last: 8.65 seconds :: '

In [4]:
table_name = 'stg_topics'
db_name = '02_stg'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} ORDER BY openalex_primary_topic_count DESC LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_topics already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_topics
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_topics/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_topics created | since_start: 5.94 seconds | since_last: 5.94 seconds :: 


Unnamed: 0,openalex_primary_topic_index,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_count,openalex_primary_topic_percent,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name
0,0,10028,Topic Modeling,18722,4.727157,1702,Artificial Intelligence
1,1,10181,Natural Language Processing Techniques,14653,3.699767,1702,Artificial Intelligence
2,2,10211,Computational Drug Discovery Methods,11760,2.969307,1703,Computational Theory and Mathematics


Unnamed: 0,c
0,302


 :: "02_stg".stg_topics queries finished | since_start: 13.88 seconds | since_last: 7.94 seconds :: 


' :: "02_stg".stg_topics queries finished | since_start: 13.88 seconds | since_last: 7.94 seconds :: '

In [5]:
table_name = 'stg_subfields'
db_name = '02_stg'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} ORDER BY openalex_primary_topic_subfield_count DESC LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_subfields already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_subfields
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_subfields/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_subfields created | since_start: 3.56 seconds | since_last: 3.56 seconds :: 


Unnamed: 0,openalex_primary_topic_subfield_index,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_subfield_count,openalex_primary_topic_subfield_percent
0,0,1702,Artificial Intelligence,151180,38.171755
1,1,1707,Computer Vision and Pattern Recognition,64947,16.398604
2,2,1710,Information Systems,54555,13.774706


Unnamed: 0,c
0,11


 :: "02_stg".stg_subfields queries finished | since_start: 10.82 seconds | since_last: 7.26 seconds :: 


' :: "02_stg".stg_subfields queries finished | since_start: 10.82 seconds | since_last: 7.26 seconds :: '

In [6]:
table_name = 'stg_unified_works_metadata_04_with_topics_and_subfields'
db_name = '02_stg'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} ORDER BY RANDOM() LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_unified_works_metadata_04_with_topics_and_subfields already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_unified_works_metadata_04_with_topics_and_subfields
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_metadata_04_with_topics_and_subfields/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_metadata_04_with_topics_and_subfields created | since_start: 8.74 seconds | since_last: 8.74 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_count,openalex_primary_topic_index,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_subfield_count,openalex_primary_topic_subfield_index,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on,subset
0,276951401,,10.1109/access.2025.3549781,,2025,,CCBY,1,https://doi.org/10.1109/ACCESS.2025.3549781,0,1,1,4408323703,10.1109/access.2025.3549781,en,10028,Topic Modeling,18722,0,1702,Artificial Intelligence,151180,0,17,Computer Science,3,Physical Sciences,doi,train
1,274820729,,10.1038/s41598-024-81151-1,,2024,2024-12-17,CCBY,1,https://pmc.ncbi.nlm.nih.gov/articles/PMC11652632,0,1,1,4405499124,10.1038/s41598-024-81151-1,en,11512,Anomaly Detection Techniques and Applications,3845,14,1702,Artificial Intelligence,151180,0,17,Computer Science,3,Physical Sciences,doi,train
2,257084396,,10.1038/s41597-023-01985-8,2303.14884,2023,2023-02-23,CCBY,1,https://pmc.ncbi.nlm.nih.gov/articles/PMC9950383,0,1,1,4321615127,10.1038/s41597-023-01985-8,en,10601,Handwritten Text Recognition Techniques,1658,75,1707,Computer Vision and Pattern Recognition,64947,1,17,Computer Science,3,Physical Sciences,doi,train


Unnamed: 0,c
0,396052


 :: "02_stg".stg_unified_works_metadata_04_with_topics_and_subfields queries finished | since_start: 17.47 seconds | since_last: 8.72 seconds :: 


' :: "02_stg".stg_unified_works_metadata_04_with_topics_and_subfields queries finished | since_start: 17.47 seconds | since_last: 8.72 seconds :: '

In [8]:
table_name = 'stg_unified_works_filtered'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_unified_works_filtered already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_unified_works_filtered
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_filtered/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_filtered created | since_start: 1.0 minute, 14.65 seconds | since_last: 1.0 minute, 14.65 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_count,openalex_primary_topic_index,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_subfield_count,openalex_primary_topic_subfield_index,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on,title,content_abstract,content_text,annotations_paragraph,annotations_section_header,subset
0,195564019,2948466910,10.19105/ojbs.v13i1.2262,,2019,2019-05-31,CCBYNC,1,https://doi.org/10.19105/OJBS.V13I1.2262,1,1,1,2948466910,10.19105/ojbs.v13i1.2262,en,14516,English Language Learning and Teaching,1518,86,1710,Information Systems,54555,2,17,Computer Science,3,Physical Sciences,doi,The Effectiveness of Tourist Hunting Project in Improving Students’ English Communication Skill,Language practice through direct communication will develop students’ English Communication Skills especially when they practice it with native English speakers. This study aims to find students’ level of interest in performing Tourist Hunting Project (THP) and the effectiveness of THP in improv...,"\nA. Introduction\n\n2 Zoltán Dörnyei and Ema Ushioda, Teaching and find efficient and effective methods for mastering it. There are various ways of English learning and one approach that might be considered to improve the English proficiency is through proper communication practice. \n\nLanguag...","[{""attributes"":null,""end"":282,""start"":18},{""attributes"":null,""end"":571,""start"":284},{""attributes"":null,""end"":1461,""start"":573},{""attributes"":null,""end"":1678,""start"":1463},{""attributes"":null,""end"":2010,""start"":1680},{""attributes"":null,""end"":2235,""start"":2045},{""attributes"":null,""end"":4199,""start""...","[{""attributes"":null,""end"":16,""start"":1},{""attributes"":null,""end"":2043,""start"":2012},{""attributes"":null,""end"":5410,""start"":5401},{""attributes"":{""n"":""2.""},""end"":5697,""start"":5683},{""attributes"":null,""end"":5911,""start"":5890}]",test
1,64026328,2343903314,10.2991/icassr-15.2016.128,,2016,2016-08-01,CCBYNC,1,https://doi.org/10.2991/ICASSR-15.2016.128,1,1,1,2343903314,10.2991/icassr-15.2016.128,en,14430,Higher Education and Teaching Methods,569,188,1710,Information Systems,54555,2,17,Computer Science,3,Physical Sciences,doi,University computer-based teaching model of reflection and exploration,"Used to reflect the concept of computer-based teaching, the teaching body to explore a conversion, that is possible to provide students with active development of space, students have continued the development of student ability, problem-solving teaching practical skills. Index Terms - Computer ...","\nIntroduction\n\nIn recent years, the reform of teaching contents and curriculum have made significant achievements, but to improve teaching methods, teaching procedures and teaching methods, progress is slow. Obviously, we can no longer teaching the traditional subjects as well-organized and d...","[{""attributes"":null,""end"":598,""start"":15},{""attributes"":null,""end"":737,""start"":645},{""attributes"":null,""end"":1116,""start"":811},{""attributes"":null,""end"":1417,""start"":1171},{""attributes"":null,""end"":2404,""start"":1480},{""attributes"":null,""end"":2948,""start"":2406},{""attributes"":null,""end"":3417,""start""...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":643,""start"":600},{""attributes"":null,""end"":809,""start"":739},{""attributes"":null,""end"":1169,""start"":1118},{""attributes"":null,""end"":1478,""start"":1419},{""attributes"":null,""end"":3030,""start"":2950},{""attributes"":{""n"":""3.""},""en...",train
2,73438169,2912765151,10.3758/s13428-019-01201-9,,2019,2019-02-13,CCBY,1,https://pmc.ncbi.nlm.nih.gov/articles/PMC6691032,1,1,1,2912765151,10.3758/s13428-019-01201-9,en,13083,Advanced Text Analysis Techniques,1542,84,1702,Artificial Intelligence,151180,0,17,Computer Science,3,Physical Sciences,doi,Identifying category representations for complex stimuli using discrete Markov chain Monte Carlo with people,"With the explosion of “big data,” digital repositories of texts and images are growing rapidly. These datasets present new opportunities for psychological research, but they require new methodologies before researchers can use these datasets to yield insights into human cognition. We present a n...","\nBig data is transforming society and offers a significant opportunity for psychology research: Large databases of text and images are potentially rich resources for understanding human cognition. However, the current usefulness of big data in psychology is limited, in part because of its size-...","[{""attributes"":null,""end"":2068,""start"":1},{""attributes"":null,""end"":3409,""start"":2070},{""attributes"":null,""end"":4031,""start"":3411},{""attributes"":null,""end"":4856,""start"":4033},{""attributes"":null,""end"":5493,""start"":4896},{""attributes"":null,""end"":6711,""start"":5495},{""attributes"":null,""end"":7646,""sta...","[{""attributes"":null,""end"":4894,""start"":4858},{""attributes"":null,""end"":15379,""start"":15346},{""attributes"":null,""end"":15894,""start"":15888},{""attributes"":null,""end"":19483,""start"":19476},{""attributes"":null,""end"":22636,""start"":22605},{""attributes"":null,""end"":23543,""start"":23537},{""attributes"":null,""e...",train


Unnamed: 0,c
0,388735


 :: "02_stg".stg_unified_works_filtered queries finished | since_start: 1.0 minute, 24.97 seconds | since_last: 10.31 seconds :: 


' :: "02_stg".stg_unified_works_filtered queries finished | since_start: 1.0 minute, 24.97 seconds | since_last: 10.31 seconds :: '

In [9]:
table_name = 'unified_works'
db_name = '03_core'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    s3_parent_target_path=config.S3_CORE_DATA_PATH
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=100)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 03_core.unified_works already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core unified_works
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".unified_works created | since_start: 39.83 seconds | since_last: 39.83 seconds :: 


Unnamed: 0,id,topic_index,topic_display_name,subfield_index,subfield_display_name,title,abstract,fulltext,subset
0,126230559,112,semigroups and automata theory,4,Computational Theory and Mathematics,Algebraic properties of Zappa–Szép products of semigroups and monoids,"Direct, semidirect and Zappa–Szép products provide tools to decompose algebraic structures, with...",\nIntroduction\n\nSemidirect products and their generalisation Zappa-Szép products have become w...,train
1,57189425,112,semigroups and automata theory,4,Computational Theory and Mathematics,Undecidability of a weak version of MSO+U,We prove the undecidability of MSO on $\omega$-words extended with the second-order predicate $U...,\nIntroduction\n\nThis paper is about monadic second-order logic (mso) on ω-words. Büchi's famou...,train
2,32354455,112,semigroups and automata theory,4,Computational Theory and Mathematics,Minimizing Resources of Sweeping and Streaming String Transducers,We consider minimization problems for natural parameters of word transducers: the number of pass...,"\nIntroduction\n\nRegular word functions extend the robust family of regular languages, preservi...",train


Unnamed: 0,c
0,388735


 :: "03_core".unified_works queries finished | since_start: 48.46 seconds | since_last: 8.63 seconds :: 


' :: "03_core".unified_works queries finished | since_start: 48.46 seconds | since_last: 8.63 seconds :: '

In [10]:
timelogger = utils.TimeLogger()
db_name = '03_core'
table_names = ['unified_works_train', 'unified_works_test', 'unified_works_validation']
for table_name in table_names:
    utils.create_table_from_sql_file(
        database_name = db_name,
        table_name = table_name,
        overwrite_strategy='overwrite', # options: fail, overwrite, ignore
        wait=True,
        s3_parent_target_path=config.S3_CORE_DATA_PATH
    )
    timelogger.log(f'"{db_name}".{table_name} created')
    
    utils.pd_set_options(cols=100)
    display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
    display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
    timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 03_core.unified_works_train already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core unified_works_train
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works_train/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".unified_works_train created | since_start: 36.08 seconds | since_last: 36.08 seconds :: 


Unnamed: 0,id,topic_index,topic_display_name,subfield_index,subfield_display_name,title,abstract,fulltext,subset
0,247788431,84,Advanced Text Analysis Techniques,0,Artificial Intelligence,Research on Keyword Extraction Algorithm in English Text Based on Cluster Analysis,How to facilitate users to quickly and accurately search for the text information they need is a...,"\nIntroduction\n\nWith the advancement of information technology, mankind is carrying out the bi...",train
1,271329605,84,Advanced Text Analysis Techniques,0,Artificial Intelligence,Citation network analysis for viewpoint plurality assessment of historical corpora: The case of ...,"Citation networks enable analysis of author groups, defining in-group dynamics, and mapping out ...","\nIntroduction\n\nIn the past decades, citation analysis has been widely utilized in the fields ...",train
2,182953250,84,Advanced Text Analysis Techniques,0,Artificial Intelligence,Neural Keyphrase Generation via Reinforcement Learning with Adaptive Rewards,Generating keyphrases that summarize the main points of a document is a fundamental task in natu...,\nIntroduction\n\nThe task of keyphrase generation aims at predicting a set of keyphrases that c...,train


Unnamed: 0,c
0,310751


 :: "03_core".unified_works_train queries finished | since_start: 44.76 seconds | since_last: 8.69 seconds :: 
Table 03_core.unified_works_test already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core unified_works_test
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works_test/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".unified_works_test created | since_start: 54.14 seconds | since_last: 9.38 seconds :: 


Unnamed: 0,id,topic_index,topic_display_name,subfield_index,subfield_display_name,title,abstract,fulltext,subset
0,7799017,60,Optimization and Variational Analysis,4,Computational Theory and Mathematics,Convergence of two-step iterative scheme with errors for two asymptotically nonexpansive mappings,A two-step iterative scheme with errors has been studied to approximate the common fixed points ...,"\nIntroduction.\n\nIn 1995, Liu [4] introduced iterative schemes with errors as follows. \n\n(a)...",test
1,234050919,60,Optimization and Variational Analysis,4,Computational Theory and Mathematics,Generalized Complementarity Problem with Three Classes of Generalized Variational Inequalities I...,"In this study, we introduce and study a generalized complementarity problem involving XOR operat...",\nIntroduction\n\nIt is well known that the many unrelated free boundary value problems related ...,test
2,268539518,60,Optimization and Variational Analysis,4,Computational Theory and Mathematics,Effective Rates for Iterations Involving Bregman Strongly Nonexpansive Operators,We develop the theory of Bregman strongly nonexpansive maps for uniformly Fréchet differentiable...,\nIntroduction\n\nMonotone set-valued operators A : X → 2 X in a Hilbert space X are usually stu...,test


Unnamed: 0,c
0,39017


 :: "03_core".unified_works_test queries finished | since_start: 1.0 minute, 3.26 seconds | since_last: 9.12 seconds :: 
Table 03_core.unified_works_validation already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core unified_works_validation
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works_validation/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".unified_works_validation created | since_start: 1.0 minute, 12.67 seconds | since_last: 9.41 seconds :: 


Unnamed: 0,id,topic_index,topic_display_name,subfield_index,subfield_display_name,title,abstract,fulltext,subset
0,252821322,191,E-Learning and Knowledge Management,7,Computer Science Applications,Construction of artificial intelligence-assisted English learning resource query system,English has become an important tool for China's opening to the outside world and exchanges with...,"\nIntroduction\n\nAt present, English learning websites are full of money, and many websites hav...",validation
1,181568039,191,E-Learning and Knowledge Management,7,Computer Science Applications,Research-based learning: a case study for engineering students,An implementation of the research-based learning (RBL) model and methodologies for undergraduate...,\nIntroduction\n\nResearch-based learning (RBL) aims to promote and develop student competencies...,validation
2,233285309,218,Multimedia Learning Systems,2,Information Systems,A Study of Irrigation Performance Index and Real Cost Value of Irrigation Operations and Mainten...,Irrigation Asset Management is required to maintain the value of irrigation asset function and c...,\nIntroduction\n\nIrrigation is a crucial factor for maximizing the potential of agriculture pro...,validation


Unnamed: 0,c
0,38967


 :: "03_core".unified_works_validation queries finished | since_start: 1.0 minute, 20.45 seconds | since_last: 7.78 seconds :: 


In [11]:
display(wr.athena.read_sql_query(f"""
    SELECT 
        subset,
        COUNT(*) AS c,
        COUNT(*) * 100.0 / (SELECT COUNT(*) FROM "03_core".unified_works) AS p
    FROM 
        "03_core".unified_works_train
    GROUP BY
        subset
    
    UNION ALL

    SELECT 
        subset,
        COUNT(*) AS c,
        COUNT(*) * 100.0 / (SELECT COUNT(*) FROM "03_core".unified_works) AS p
    FROM 
        "03_core".unified_works_test
    GROUP BY
        subset

    UNION ALL

    SELECT 
        subset,
        COUNT(*) AS c,
        COUNT(*) * 100.0 / (SELECT COUNT(*) FROM "03_core".unified_works) AS p
    FROM 
        "03_core".unified_works_validation
    GROUP BY
        subset
""", db_name))

Unnamed: 0,subset,c,p
0,validation,38967,10.024052
1,test,39017,10.036915
2,train,310751,79.939033


In [12]:
table_name = 'topics'
db_name = '03_core'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    s3_parent_target_path=config.S3_CORE_DATA_PATH
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 30 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 03_core.topics already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core topics
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/topics/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".topics created | since_start: 3.50 seconds | since_last: 3.50 seconds :: 


Unnamed: 0,topic_index,topic_original_id,topic_display_name,topic_count,subfield_original_id,subfield_display_name
0,0,10028,Topic Modeling,18722,1702,Artificial Intelligence
1,1,10181,Natural Language Processing Techniques,14653,1702,Artificial Intelligence
2,2,10211,Computational Drug Discovery Methods,11760,1703,Computational Theory and Mathematics
3,15,10080,Energy Efficient Wireless Sensor Networks,3819,1705,Computer Networks and Communications
4,16,10664,Sentiment Analysis and Opinion Mining,3817,1702,Artificial Intelligence
5,17,10648,Virtual Reality Applications and Impacts,3512,1709,Human-Computer Interaction
6,18,10331,Video Surveillance and Tracking Methods,3463,1707,Computer Vision and Pattern Recognition
7,19,11714,Multimodal Machine Learning Applications,3423,1707,Computer Vision and Pattern Recognition
8,20,13602,Educational Methods and Media Use,3422,1710,Information Systems
9,21,10260,Software Engineering Research,3274,1710,Information Systems


Unnamed: 0,c
0,302


 :: "03_core".topics queries finished | since_start: 11.14 seconds | since_last: 7.63 seconds :: 


' :: "03_core".topics queries finished | since_start: 11.14 seconds | since_last: 7.63 seconds :: '

In [13]:
table_name = 'subfields'
db_name = '03_core'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    s3_parent_target_path=config.S3_CORE_DATA_PATH
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 03_core.subfields already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core subfields
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/subfields/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".subfields created | since_start: 4.61 seconds | since_last: 4.61 seconds :: 


Unnamed: 0,subfield_index,subfield_original_id,subfield_display_name,subfield_count
0,0,1702,Artificial Intelligence,151180
1,1,1707,Computer Vision and Pattern Recognition,64947
2,2,1710,Information Systems,54555


Unnamed: 0,c
0,11


 :: "03_core".subfields queries finished | since_start: 12.27 seconds | since_last: 7.66 seconds :: 


' :: "03_core".subfields queries finished | since_start: 12.27 seconds | since_last: 7.66 seconds :: '

In [14]:
table_name = 'stg_filtered_work_section_annotations'
db_name = '02_stg'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    # s3_parent_target_path=config.S3_CORE_DATA_PATH
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_filtered_work_section_annotations already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_filtered_work_section_annotations
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_filtered_work_section_annotations/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_filtered_work_section_annotations created | since_start: 36.89 seconds | since_last: 36.89 seconds :: 


Unnamed: 0,work_id,section_index,section_type_index,n,section_start,section_end,section_type
0,394987,59,48,,24115,24776,paragraph
1,394987,60,49,,24778,24927,paragraph
2,394987,61,50,,24929,25624,paragraph


Unnamed: 0,c
0,34397196


 :: "02_stg".stg_filtered_work_section_annotations queries finished | since_start: 46.07 seconds | since_last: 9.18 seconds :: 


' :: "02_stg".stg_filtered_work_section_annotations queries finished | since_start: 46.07 seconds | since_last: 9.18 seconds :: '

In [15]:
table_name = 'stg_filtered_work_sections'
db_name = '02_stg'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    # s3_parent_target_path=config.S3_CORE_DATA_PATH
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_filtered_work_sections already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_filtered_work_sections
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_filtered_work_sections/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_filtered_work_sections created | since_start: 43.66 seconds | since_last: 43.66 seconds :: 


Unnamed: 0,work_id,section_index,section_type_index,n,section_start,section_end,section_type,section_text,section_text_length
0,284264,35,31,,14785,14968,paragraph,The total number of bits in the signature is calculated using the mathematical formula ( 8) and the size of the signature (in byte) is calculated using the mathematical formula ( 9):,182
1,284264,36,32,,14970,15018,paragraph,SizeOfSignature  round (TotalNoOfBits / 8) (9),47
2,284264,37,5,,15020,15055,header,Phase 2: Check Image Authentication,35


Unnamed: 0,c
0,34397196


 :: "02_stg".stg_filtered_work_sections queries finished | since_start: 53.59 seconds | since_last: 9.94 seconds :: 


' :: "02_stg".stg_filtered_work_sections queries finished | since_start: 53.59 seconds | since_last: 9.94 seconds :: '

In [16]:
table_name = 'stg_unified_works_filtered_with_section_stats'
db_name = '02_stg'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    # s3_parent_target_path=config.S3_CORE_DATA_PATH
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_unified_works_filtered_with_section_stats already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_unified_works_filtered_with_section_stats
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_filtered_with_section_stats/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_filtered_with_section_stats created | since_start: 36.62 seconds | since_last: 36.62 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_count,openalex_primary_topic_index,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_subfield_count,openalex_primary_topic_subfield_index,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on,title,content_abstract,content_text,annotations_paragraph,annotations_section_header,subset,number_of_sections,number_of_headers,number_of_paragraphs,has_sections,has_headers,has_paragraphs
0,51773146,2805461915.0,10.18653/v1/s18-1017,,2018,2018-06-01,CCBY,1,https://aclanthology.org/S18-1017,1,1,1,2805461915,10.18653/v1/s18-1017,en,10664,Sentiment Analysis and Opinion Mining,3817,16,1702,Artificial Intelligence,151180,0,17,Computer Science,3,Physical Sciences,doi,SINAI at SemEval-2018 Task 1: Emotion Recognition in Tweets,"Emotion classification is a new task that combines several disciplines including Artificial Intelligence and Psychology, although Natural Language Processing is perhaps the most challenging area. In this paper, we describe our participation in SemEval-2018 Task1: Affect in Tweets. In particular,...","\nIntroduction\n\nEmotions are playing a significant role in the effective communication of people. In fact, sometimes, emotional intelligence is more important than cognitive intelligence for successful interaction (Pantic et al., 2005). Therefore, affective computing is a key element to the ad...","[{""attributes"":null,""end"":460,""start"":15},{""attributes"":null,""end"":794,""start"":462},{""attributes"":null,""end"":1015,""start"":796},{""attributes"":null,""end"":1604,""start"":1017},{""attributes"":null,""end"":1951,""start"":1606},{""attributes"":null,""end"":2266,""start"":1999},{""attributes"":null,""end"":2584,""start""...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":null,""end"":1997,""start"":1953},{""attributes"":{""n"":""2.""},""end"":2590,""start"":2586},{""attributes"":{""n"":""3.""},""end"":3028,""start"":3019},{""attributes"":{""n"":""4.""},""end"":5858,""start"":5840},{""attributes"":{""n"":""5.""},""end"":8695,""start"":8676},{""attr...",test,44,7,37,1,1,1
1,18536313,2473287144.0,10.18653/v1/s16-1025,,2016,2016-06-01,CCBY,1,https://aclanthology.org/S16-1025,1,1,1,2473287144,10.18653/v1/s16-1025,en,10664,Sentiment Analysis and Opinion Mining,3817,16,1702,Artificial Intelligence,151180,0,17,Computer Science,3,Physical Sciences,doi,NRU-HSE at SemEval-2016 Task 4: Comparative Analysis of Two Iterative Methods Using Quantification Library,"In many areas, such as social science, politics or market research, people need to track sentiment and their changes over time. For sentiment analysis in this field it is more important to correctly estimate proportions of each sentiment expressed in the set of documents (quantification task) th...","\nIntroduction\n\nIn many areas, such as customer-relationship management or opinion mining, people need to track changes over time and measure proportions of documents expressing different sentiments. In these situations, the task of accurate categorization of each document is replaced by the t...","[{""attributes"":null,""end"":604,""start"":15},{""attributes"":null,""end"":1033,""start"":606},{""attributes"":null,""end"":1759,""start"":1035},{""attributes"":null,""end"":1950,""start"":1761},{""attributes"":null,""end"":2307,""start"":1952},{""attributes"":null,""end"":2419,""start"":2333},{""attributes"":null,""end"":2592,""star...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{""attributes"":{""n"":""2""},""end"":2331,""start"":2309},{""attributes"":{""n"":""2.1""},""end"":3258,""start"":3240},{""attributes"":{""n"":""2.2""},""end"":3896,""start"":3869},{""attributes"":{""n"":""2.3""},""end"":4960,""start"":4928},{""attributes"":{""n"":""2.4""},""end"":5261,""start"":5220...",test,80,15,65,1,1,1
2,248581257,,10.32473/flairs.v35i.130601,,2022,2022-05-04,CCBYNC,1,https://doi.org/10.32473/flairs.v35i.130601,0,1,1,4225386884,10.32473/flairs.v35i.130601,en,10664,Sentiment Analysis and Opinion Mining,3817,16,1702,Artificial Intelligence,151180,0,17,Computer Science,3,Physical Sciences,doi,Exploring BERT for Aspect-based Sentiment Analysis in Portuguese Language,"Aspect-Based Sentiment Analysis (ABSA) is a Natural Language Processing (NLP) task that extracts referred aspects from text and assigns polarities to opinions about those aspects. Most research on ABSA focuses on English. Only a few ABSA works deal with the Portuguese language. In this work, we ...","\nIntroduction\n\nSentiment Analysis (SA) is the field in Natural Language Processing (NLP) that automatically analyzes people's sentiments or opinions towards some entity. These sentiments can be valuable sources of information about the consumer's feelings about a particular product or idea, w...","[{""attributes"":null,""end"":361,""start"":15},{""attributes"":null,""end"":586,""start"":363},{""attributes"":null,""end"":762,""start"":588},{""attributes"":null,""end"":1021,""start"":764},{""attributes"":null,""end"":1982,""start"":1023},{""attributes"":null,""end"":2155,""start"":1984},{""attributes"":null,""end"":2370,""start"":2...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":2401,""start"":2372},{""attributes"":null,""end"":3843,""start"":3812},{""attributes"":null,""end"":4697,""start"":4693},{""attributes"":null,""end"":5025,""start"":5001},{""attributes"":null,""end"":5914,""start"":5901},{""attributes"":null,""end"":10349,""star...",test,51,10,41,1,1,1


Unnamed: 0,c
0,388735


 :: "02_stg".stg_unified_works_filtered_with_section_stats queries finished | since_start: 44.75 seconds | since_last: 8.14 seconds :: 


' :: "02_stg".stg_unified_works_filtered_with_section_stats queries finished | since_start: 44.75 seconds | since_last: 8.14 seconds :: '

In [17]:
table_name = 'stg_filtered_work_sections_with_headers'
db_name = '02_stg'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    # s3_parent_target_path=config.S3_CORE_DATA_PATH
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_filtered_work_sections_with_headers already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_filtered_work_sections_with_headers
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_filtered_work_sections_with_headers/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_filtered_work_sections_with_headers created | since_start: 32.30 seconds | since_last: 32.30 seconds :: 


Unnamed: 0,work_id,section_index,section_type_index,n,section_start,section_end,section_type,section_text,section_text_length,work_has_sections,work_has_headers,work_has_paragraphs,work_number_of_sections,work_number_of_headers,work_number_of_paragraphs
0,269217375,24,19,,17967,18221,paragraph,"In this work, the data underwent a series of transformations, including reduction, the application of principal component analysis (PCA), cleaning, partitioning, and augmentation.The preprocessing procedures employed in this study are depicted in Fig. 1.",254,1,1,1,66,16,50
1,237624416,66,13,,23069,23108,header,2-Presenting and discussing the results,39,1,1,1,95,21,74
2,237624416,67,14,,23110,23167,header,2-1 Description of the research sample (demographic data),57,1,1,1,95,21,74


Unnamed: 0,c
0,34356297


 :: "02_stg".stg_filtered_work_sections_with_headers queries finished | since_start: 40.98 seconds | since_last: 8.68 seconds :: 


' :: "02_stg".stg_filtered_work_sections_with_headers queries finished | since_start: 40.98 seconds | since_last: 8.68 seconds :: '

In [18]:
table_name = 'stg_filtered_work_chapters'
db_name = '02_stg'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    # s3_parent_target_path=config.S3_CORE_DATA_PATH
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_filtered_work_chapters already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_filtered_work_chapters
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_filtered_work_chapters/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_filtered_work_chapters created | since_start: 48.83 seconds | since_last: 48.83 seconds :: 


Unnamed: 0,work_id,header_index,number_of_paragraphs_in_chapter,min_section_start,max_section_end,header_text,header_text_lower,header_text_length,chapter_text,chapter_text_length,section_starts,section_ends,section_indices,section_type_indices,chapter_is_research_methodology
0,233138006,16,2,36623,38103,VI. DISCUSSION,vi. discussion,14,"The behavior-based analysis focuses on selecting features based on a particular concept or pattern that can extract different behavior patterns over time. In this case, we chose the flow-based features based on the theoretical relationship between the command and control server that is used by t...",1462,"[36623, 36639, 37744]","[36637, 37742, 38103]","[92, 93, 94]","[16, 77, 78]",
1,233138006,17,29,38105,40274,VII. CONCLUSION,vii. conclusion,15,"As mention in the literature review referring to TABLE 3, our outcome is in total contrast with the previous researcher's result. TABLE 3 shows that oversampling improves the result that produces by the classifier. However, surprisingly, oversampling in our research did not show any significant ...",2060,"[38105, 38122, 38656, 39184, 40137, 40142, 40147, 40152, 40157, 40163, 40168, 40173, 40183, 40188, 40197, 40202, 40207, 40212, 40217, 40222, 40227, 40232, 40237, 40242, 40247, 40252, 40257, 40262, 40268, 40273]","[38120, 38654, 39182, 40133, 40138, 40143, 40148, 40153, 40159, 40164, 40169, 40179, 40184, 40193, 40198, 40203, 40208, 40213, 40218, 40223, 40228, 40233, 40238, 40243, 40248, 40253, 40258, 40264, 40269, 40274]","[95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124]","[17, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107]",
2,233138012,1,23,1,8542,I. INTRODUCTION,i. introduction,15,"Cognitive radio network (CRN), which allows the primary users to share the spectrum with secondary users without impairing the quality of service (QoS) of primary users, is a promising scheme to improve the spectrum utilization efficiency [1].\n\nInformation security is a critically important is...",8501,"[1, 18, 264, 595, 991, 1208, 2748, 3515, 4478, 5226, 5726, 6057, 6305, 7093, 7227, 7423, 7522, 7579, 7754, 7879, 8046, 8109, 8297, 8416]","[16, 262, 593, 989, 1206, 2746, 3513, 4476, 5224, 5724, 6055, 6303, 7091, 7225, 7421, 7520, 7577, 7752, 7877, 8044, 8107, 8295, 8414, 8542]","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]","[1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]",


Unnamed: 0,c
0,6696116


 :: "02_stg".stg_filtered_work_chapters queries finished | since_start: 57.01 seconds | since_last: 8.19 seconds :: 


' :: "02_stg".stg_filtered_work_chapters queries finished | since_start: 57.01 seconds | since_last: 8.19 seconds :: '

In [19]:
table_name = 'stg_filtered_work_chapters_methodology'
db_name = '02_stg'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    # s3_parent_target_path=config.S3_CORE_DATA_PATH
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_filtered_work_chapters_methodology already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_filtered_work_chapters_methodology
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_filtered_work_chapters_methodology/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_filtered_work_chapters_methodology created | since_start: 9.69 seconds | since_last: 9.69 seconds :: 


Unnamed: 0,work_id,header_index,number_of_paragraphs_in_chapter,min_section_start,max_section_end,header_text,header_text_lower,header_text_length,chapter_text,chapter_text_length,section_starts,section_ends,section_indices,section_type_indices,chapter_is_research_methodology,number_of_methodology_chapters_per_work
0,468163,11,6,26374,29390,Methodology,methodology,11,This proposal is based on the concepts of Inter-Domain Messaging (IDM) [41] and Virtual Quality-of-service Networks (VQN) [42]. IDM is a novel solution for transporting messages in a heterogeneous environment. IDM was designed as a general purpose protocol for providing a data transport service ...,2997,"[26374, 26387, 27217, 27797, 28333, 29344, 29368]","[26385, 27215, 27795, 28331, 29342, 29366, 29390]","[52, 53, 54, 55, 56, 57, 58]","[11, 42, 43, 44, 45, 46, 47]",1,1
1,1246494,12,7,24646,27226,Methodology,methodology,11,"We used simulation to prove the performance of our solution. The simulation tool adopted was Tossim (http://docs.tinyos.net/tinywiki/index.php/TOSSIM), because we have the device kits of Crossbow (http://www.xbow.com) to later perform testbeds on field and improve our solution. This kind of devi...",2560,"[24646, 24659, 25034, 25334, 25561, 25916, 26303, 26759]","[24657, 25032, 25332, 25559, 25914, 26301, 26757, 27226]","[59, 60, 61, 62, 63, 64, 65, 66]","[12, 48, 49, 50, 51, 52, 53, 54]",1,1
2,1477381,5,1,9135,9785,Methodology,methodology,11,"The first step towards fostering the use of e-Administration among persons with disabilities consists in ascertaining the specific difficulties they face when they wish to carry out a procedure with the administration. A quantitative analysis was designed, requiring (i) a selection of a signific...",636,"[9135, 9148]","[9146, 9785]","[27, 28]","[5, 23]",1,1


Unnamed: 0,c
0,34034


 :: "02_stg".stg_filtered_work_chapters_methodology queries finished | since_start: 18.30 seconds | since_last: 8.61 seconds :: 


' :: "02_stg".stg_filtered_work_chapters_methodology queries finished | since_start: 18.30 seconds | since_last: 8.61 seconds :: '

In [20]:
table_name = 'stg_filtered_work_chapters_methodology_single'
db_name = '02_stg'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name=db_name,
    table_name=table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    # s3_parent_target_path=config.S3_CORE_DATA_PATH
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_filtered_work_chapters_methodology_single already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_filtered_work_chapters_methodology_single
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_filtered_work_chapters_methodology_single/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_filtered_work_chapters_methodology_single created | since_start: 6.98 seconds | since_last: 6.98 seconds :: 


Unnamed: 0,work_id,header_index,number_of_paragraphs_in_chapter,min_section_start,max_section_end,header_text,header_text_lower,header_text_length,chapter_text,chapter_text_length,section_starts,section_ends,section_indices,section_type_indices,chapter_is_research_methodology,number_of_methodology_chapters_per_work
0,788054,7,1,13492,15249,Methodology,methodology,11,"To facilitate establishment and management of the VE and the integration and interoperation of business processes, a SOA for ontology-based MAS is developed. The MAS contains agents that communicate with each other within a distributed and interoperable environment. The system is implemented in ...",1743,"[13492, 13505]","[13503, 15249]","[21, 22]","[7, 15]",1,1
1,5890185,4,1,8902,9171,Methodology,methodology,11,"In this section, we introduce the proposed framework for the breakdown of VQA. As illustrated in Figure 3, the framework consists of three modules: word prediction, sentence generation, and answer reasoning. Next, we describe the three modules in details.",255,"[8902, 8915]","[8913, 9171]","[17, 18]","[4, 14]",1,1
2,6458516,5,2,11423,12635,Study methodology,study methodology,17,"To evaluate our approach, we conducted a study with nine participants (3 female, 6 male, between 20 and 27 years). Participants either did not require prescription glasses or wore contact lenses. Each participant was given a pair of smart glasses. For heart rate reference measurements, participa...",1191,"[11423, 11442, 12127]","[11440, 12125, 12635]","[24, 25, 26]","[5, 20, 21]",1,1


Unnamed: 0,c
0,31595


 :: "02_stg".stg_filtered_work_chapters_methodology_single queries finished | since_start: 16.70 seconds | since_last: 9.72 seconds :: 


' :: "02_stg".stg_filtered_work_chapters_methodology_single queries finished | since_start: 16.70 seconds | since_last: 9.72 seconds :: '