In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Adding ../01_modules or ./01_modules to the system path so that we can load modules from 
# there as well
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
if os.path.exists(modules_path_in_dev):
    sys.path.append(modules_path_in_dev)
if os.path.exists(modules_path_in_prod):
    sys.path.append(modules_path_in_prod)


# # Jupyter only reads a local module the first time after 
# # kernel start. Re-running a cell with 
# # "from mymodulename import *" would not change
# # anything, even if the imported module has since changed.
# # As a workaround, we need to directly load the module, 
# # use importlib.reload to reload it and then import * 
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [2]:
if False: # do not run this again, takes a day
    execution_role = get_execution_role()
    source_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    print('source_dir:', source_dir)
    sklearn_processor = FrameworkProcessor(
        estimator_cls=SKLearn,
        framework_version='1.2-1', # The newest supported version by sagemaker
        instance_type='ml.c7i.16xlarge',
        instance_count=1,
        base_job_name=f'openalex_works_reduction'.replace('_','-'),
        role=execution_role
    )
    
    step_args = sklearn_processor.run(
        code='src/03_transformation/03_11_transformation_openalex_works_reduction.py',
        source_dir=source_dir,
        inputs=[], # We are not using automatic input-output mapping, instead we handle everything in the script directly on S3
        outputs=[],
        arguments=[
            '--runtype', 'prod',
            '--file-max-limit', '10000',
        ],
        wait=True
    )

In [30]:
table_name = 'base_openalex_works_reduced_filtered'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_openalex_works_reduced_filtered/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_openalex_works_reduced_filtered created | since_start: 42.36 seconds | since_last: 42.36 seconds :: 


Unnamed: 0,id_openalex,id_doi,title,language,primary_topic_id,primary_topic_display_name,primary_topic_subfield_id,primary_topic_subfield_display_name,primary_topic_field_id,primary_topic_field_display_name,primary_topic_domain_id,primary_topic_domain_display_name
0,2184395120,,AN APPROACH FOR COMPRESSING DIGITAL IMAGES BY USING RUN LENGTH ENCODING,en,T10901,Advanced Data Compression Techniques,1707,Computer Vision and Pattern Recognition,17,Computer Science,3,Physical Sciences
1,2184397778,,LEARNINGMINIMUM VOLUME SETSWITH SUPPORTVECTOR MACHINES,en,T11512,Anomaly Detection Techniques and Applications,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences
2,2184397944,,Analyzing thePerformance ofVoice over Internet Protocol ina 3GNetwork,en,T10575,Wireless Communication Networks Research,1705,Computer Networks and Communications,17,Computer Science,3,Physical Sciences


Unnamed: 0,c
0,13242469


 :: "02_stg".base_openalex_works_reduced_filtered queries finished | since_start: 50.44 seconds | since_last: 8.08 seconds :: 


' :: "02_stg".base_openalex_works_reduced_filtered queries finished | since_start: 50.44 seconds | since_last: 8.08 seconds :: '

In [3]:
table_name = 'base_semanticscholar_s2orcv2'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.base_semanticscholar_s2orcv2 already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg base_semanticscholar_s2orcv2
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_semanticscholar_s2orcv2/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_semanticscholar_s2orcv2 created | since_start: 11.0 minutes, 1.71 seconds | since_last: 11.0 minutes, 1.71 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,title,source_url,openaccess_status,content_text,annotations_paragraph,annotations_section_header,license
0,1808709,2101143116,10.1186/1476-511x-13-132,,Twist 1 regulates the expression of PPARγ during hormone-induced 3T3-L1 preadipocyte differentiation: a possible role in obesity and associated diseases,https://pmc.ncbi.nlm.nih.gov/articles/PMC4150960,GOLD,"\nBackground\n\nObesity has become an epidemic in the human population, and China has the highest number of obese patients in the world [1]. Because obesity involves an increase in the number of adipocytes, any of the factors involved in adipocyte differentiation might be of great importance for the development of obesity. To date, numerous factors and proteins have been implicated in the generation of new fat cells, including peroxisome proliferator-activated receptor gamma (PPARγ) [2,3], C...","[{""attributes"":null,""end"":781,""start"":13},{""attributes"":null,""end"":2087,""start"":783},{""attributes"":null,""end"":3470,""start"":2089},{""attributes"":null,""end"":4312,""start"":3472},{""attributes"":null,""end"":5329,""start"":4323},{""attributes"":null,""end"":5818,""start"":5409},{""attributes"":null,""end"":6151,""start"":5890},{""attributes"":null,""end"":6744,""start"":6218},{""attributes"":null,""end"":8054,""start"":6870},{""attributes"":null,""end"":8511,""start"":8056},{""attributes"":null,""end"":9031,""start"":8525},{""attributes"":n...","[{""attributes"":null,""end"":11,""start"":1},{""attributes"":null,""end"":4321,""start"":4314},{""attributes"":null,""end"":5407,""start"":5331},{""attributes"":null,""end"":5888,""start"":5820},{""attributes"":null,""end"":6216,""start"":6153},{""attributes"":null,""end"":6868,""start"":6746},{""attributes"":null,""end"":8523,""start"":8513},{""attributes"":null,""end"":16548,""start"":16527},{""attributes"":null,""end"":16559,""start"":16550},{""attributes"":null,""end"":17519,""start"":17439},{""attributes"":null,""end"":19472,""start"":19445},{""attrib...",CCBY
1,3738414,2789005638,10.1155/2018/1042479,,PROM and Labour Effects on Urinary Metabolome: A Pilot Study,https://pmc.ncbi.nlm.nih.gov/articles/PMC5817378,GOLD,"\nIntroduction\n\nThe early diagnosis of pregnancy-related complications and the prediction of pregnancy outcome are considered strategic clinical goals to ensure the health of mothers and of their babies. Among these, premature rupture of membranes (PROM) consists of the rupture of the foetal membranes before the onset of labour. It can be observed at any gestational age [1] and occurs in approximately 10% of pregnant women and in roughly 40% of preterm deliveries [2]. Foetal membranes are ...","[{""attributes"":null,""end"":4623,""start"":15},{""attributes"":null,""end"":6006,""start"":4703},{""attributes"":null,""end"":7040,""start"":6040},{""attributes"":null,""end"":7808,""start"":7063},{""attributes"":null,""end"":9464,""start"":7836},{""attributes"":null,""end"":10835,""start"":9466},{""attributes"":null,""end"":11411,""start"":10861},{""attributes"":null,""end"":12575,""start"":11413},{""attributes"":null,""end"":12831,""start"":12585},{""attributes"":null,""end"":12984,""start"":12833},{""attributes"":null,""end"":13373,""start"":12986},{""...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":4646,""start"":4625},{""attributes"":{""n"":""2.1.""},""end"":4701,""start"":4648},{""attributes"":{""n"":""2.2.""},""end"":6038,""start"":6008},{""attributes"":{""n"":""2.3.""},""end"":7061,""start"":7042},{""attributes"":{""n"":""2.4.""},""end"":7834,""start"":7810},{""attributes"":{""n"":""3.""},""end"":10859,""start"":10837},{""attributes"":{""n"":""3.3.""},""end"":12583,""start"":12577},{""attributes"":{""n"":""4.""},""end"":13385,""start"":13375},{""attributes"":{""n"":""5.""},""end"":167...",CCBY
2,15710447,2741402565,10.18653/v1/w17-1005,,Word Embedding and Topic Modeling Enhanced Multiple Features for Content Linking and Argument / Sentiment Labeling in Online Forums,https://aclanthology.org/W17-1005,HYBRID,"\nIntroduction\n\nComments to news and their providers in online forums have been increasing rapidly in recent years with a large number of user participants and huge amount of interactive contents. How can we understand the mass of comments effectively? A crucial initial step towards this goal should be content linking, which is to determine what comments link to, be that either specific news snippets or comments by other users. Furthermore, a set of labels for a given link may be articulat...","[{""attributes"":null,""end"":585,""start"":15},{""attributes"":null,""end"":813,""start"":587},{""attributes"":null,""end"":1141,""start"":815},{""attributes"":null,""end"":1511,""start"":1143},{""attributes"":null,""end"":2352,""start"":1531},{""attributes"":null,""end"":3126,""start"":2363},{""attributes"":null,""end"":3193,""start"":3145},{""attributes"":null,""end"":3418,""start"":3211},{""attributes"":null,""end"":3543,""start"":3452},{""attributes"":null,""end"":3672,""start"":3545},{""attributes"":null,""end"":3984,""start"":3674},{""attributes"":nul...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{""attributes"":{""n"":""2""},""end"":1529,""start"":1513},{""attributes"":{""n"":""3""},""end"":2361,""start"":2354},{""attributes"":{""n"":""3.1""},""end"":3143,""start"":3128},{""attributes"":{""n"":""3.1.1""},""end"":3209,""start"":3195},{""attributes"":{""n"":""3.1.2""},""end"":3450,""start"":3420},{""attributes"":{""n"":""3.1.3""},""end"":5187,""start"":5154},{""attributes"":{""n"":""3.2""},""end"":6316,""start"":6302},{""attributes"":null,""end"":7038,""start"":7024},{""attributes"":null,""end"":7173,""start"":7150},{""at...",CCBY


Unnamed: 0,c
0,11609787


 :: "02_stg".base_semanticscholar_s2orcv2 queries finished | since_start: 11.0 minutes, 11.75 seconds | since_last: 10.04 seconds :: 


' :: "02_stg".base_semanticscholar_s2orcv2 queries finished | since_start: 11.0 minutes, 11.75 seconds | since_last: 10.04 seconds :: '

In [4]:
table_name = 'stg_semanticscholar_combined_works'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_semanticscholar_combined_works already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_semanticscholar_combined_works
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_semanticscholar_combined_works/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_semanticscholar_combined_works created | since_start: 13.0 minutes, 48.32 seconds | since_last: 13.0 minutes, 48.32 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,title,source_url,openaccess_status,content_text,annotations_paragraph,annotations_section_header,license,content_abstract,publication_year,publication_date
0,220531160,3042431448.0,10.1167/iovs.61.8.16,,Quantitative Fundus Autofluorescence in Rhesus Macaques in Aging and Age-Related Drusen,https://pmc.ncbi.nlm.nih.gov/articles/PMC7425688,GOLD,"\nD iseases of the macula, such as age-related macular degeneration (AMD) and diabetic macular edema, are leading causes of visual impairment in developed countries. 1 Animal models of macular conditions can further detail the mechanisms of their pathogenesis and reveal new insights into developing novel interventions. Nonhuman primates (NHPs) are a compelling animal model for studying macular diseases as they are the only mammals beside humans to possess a true macula. NHPs, such as rhesus ...","[{""attributes"":null,""end"":1340,""start"":1},{""attributes"":null,""end"":2554,""start"":1342},{""attributes"":null,""end"":4618,""start"":2580},{""attributes"":null,""end"":6008,""start"":4640},{""attributes"":null,""end"":7798,""start"":6010},{""attributes"":null,""end"":9037,""start"":7824},{""attributes"":null,""end"":10039,""start"":9078},{""attributes"":null,""end"":11377,""start"":10063},{""attributes"":null,""end"":12469,""start"":11391},{""attributes"":null,""end"":13219,""start"":12492},{""attributes"":null,""end"":13875,""start"":13257},{""att...","[{""attributes"":null,""end"":2563,""start"":2556},{""attributes"":null,""end"":2578,""start"":2565},{""attributes"":null,""end"":4638,""start"":4620},{""attributes"":null,""end"":7822,""start"":7800},{""attributes"":null,""end"":9076,""start"":9039},{""attributes"":null,""end"":10061,""start"":10041},{""attributes"":null,""end"":11389,""start"":11379},{""attributes"":null,""end"":12478,""start"":12471},{""attributes"":null,""end"":12490,""start"":12480},{""attributes"":null,""end"":13255,""start"":13221},{""attributes"":null,""end"":14316,""start"":14284}...",CCBYNCND,"Purpose To employ quantitative fundus autofluorescence (qAF) imaging in rhesus macaques to noninvasively assess retinal pigment epithelial (RPE) lipofuscin in nonhuman primates (NHPs) as a model of aging and age-related macular degeneration (AMD). Methods The qAF imaging was performed on eyes of 26 rhesus macaques (mean age 18.8 ± 8.2 years, range 4–27 years) with normal-appearing fundus or with age-related soft drusen using a confocal scanning laser ophthalmoscope with 488 nm excitation and...",2020,2020-07-01
1,268446036,,10.1007/s40670-024-02017-9,,Humanism Rounds: A Multifaceted “Back to Bedside” Initiative to Improve Meaning at Work for Internal Medicine Residents,https://pmc.ncbi.nlm.nih.gov/articles/PMC11180076,HYBRID,"\nIntroduction\n\nBurnout affects medical residents nationwide, leading to poor resident wellbeing, career dissatisfaction, and decreased quality of patient care [1,2].The rates of burnout among residents range from 27 to 75%, with high rates noted in obstetrics and gynecology (75%), internal medicine (63%), and general surgery (40%) with the lowest rate among family medicine residents (27%) [3].Research into burnout during residency has focused on a variety of contributing factors including...","[{""attributes"":null,""end"":1032,""start"":15},{""attributes"":null,""end"":1487,""start"":1034},{""attributes"":null,""end"":1960,""start"":1489},{""attributes"":null,""end"":2269,""start"":1985},{""attributes"":null,""end"":3087,""start"":2291},{""attributes"":null,""end"":3559,""start"":3118},{""attributes"":null,""end"":4961,""start"":3612},{""attributes"":null,""end"":5991,""start"":4980},{""attributes"":null,""end"":6547,""start"":5993},{""attributes"":null,""end"":7174,""start"":6571},{""attributes"":null,""end"":7594,""start"":7185},{""attributes""...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":1983,""start"":1962},{""attributes"":null,""end"":2289,""start"":2271},{""attributes"":null,""end"":3103,""start"":3089},{""attributes"":null,""end"":3116,""start"":3105},{""attributes"":null,""end"":3590,""start"":3561},{""attributes"":null,""end"":3610,""start"":3592},{""attributes"":null,""end"":4978,""start"":4963},{""attributes"":null,""end"":6569,""start"":6549},{""attributes"":null,""end"":7183,""start"":7176},{""attributes"":null,""end"":7641,""start"":7596},{""attributes"":nu...",CCBY,"Introduction Burnout is an increasingly prevalent problem among resident physicians. To address this problem, the Accreditation Council on Graduate Medical Education (ACGME) created the Back to Bedside initiative, supporting resident-driven projects focused on increasing direct interactions with patients. In 2017, Baylor College of Medicine (BCM) Internal Medicine Residency received a Back to Bedside grant to develop and implement “Humanism Rounds,” a multifaceted program which sought to pro...",2024,2024-03-13
2,249401160,,10.1007/s12471-022-01700-z,,Major adverse cardiovascular events in older emergency department patients presenting with non-cardiac medical complaints,https://pmc.ncbi.nlm.nih.gov/articles/PMC9691805,GOLD,"\nIntroduction\n\nOlder patients are at high risk of adverse outcomes after an emergency department (ED) visit [1,2]. However, the risk of major adverse cardiovascular events (MACE) for older ED patients, presenting with noncardiac medical complaints, is unknown. Because preventive measures may improve outcome [3], early identification of patients at risk is highly important. \n\nBesides conventional cardiovascular risk factors, the cardiac biomarkers high-sensitivity cardiac Troponin T (hs-...","[{""attributes"":null,""end"":376,""start"":15},{""attributes"":null,""end"":783,""start"":378},{""attributes"":null,""end"":1290,""start"":785},{""attributes"":null,""end"":2491,""start"":1337},{""attributes"":null,""end"":2771,""start"":2493},{""attributes"":null,""end"":3143,""start"":2796},{""attributes"":null,""end"":4944,""start"":3162},{""attributes"":null,""end"":5900,""start"":4946},{""attributes"":null,""end"":6149,""start"":5902},{""attributes"":null,""end"":6368,""start"":6169},{""attributes"":null,""end"":6672,""start"":6385},{""attributes"":nul...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":1299,""start"":1292},{""attributes"":null,""end"":1335,""start"":1301},{""attributes"":null,""end"":2794,""start"":2773},{""attributes"":null,""end"":3160,""start"":3145},{""attributes"":null,""end"":6167,""start"":6151},{""attributes"":null,""end"":6383,""start"":6370},{""attributes"":null,""end"":8057,""start"":8050},{""attributes"":null,""end"":8103,""start"":8059},{""attributes"":null,""end"":9631,""start"":9566},{""attributes"":null,""end"":10054,""start"":10031},{""attributes"":...",CCBY,"The risk of major adverse cardiovascular events (MACE) for older emergency department (ED) patients presenting with non-cardiac medical complaints is unknown. To apply preventive measures timely, early identification of high-risk patients is incredibly important. We aimed at investigating the incidence of MACE within one year after their ED visit and the predictive value of high-sensitivity cardiac troponin T (hs-cTnT) and N‑terminal pro-B-type natriuretic peptide (NT-proBNP) for subsequent ...",2022,2022-06-07


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_semanticscholar_combined_works queries finished | since_start: 13.0 minutes, 58.61 seconds | since_last: 10.29 seconds :: 


' :: "02_stg".stg_semanticscholar_combined_works queries finished | since_start: 13.0 minutes, 58.61 seconds | since_last: 10.29 seconds :: '

In [18]:
table_name = 'stg_semanticscholar_combined_works_content'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_semanticscholar_combined_works_content/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_semanticscholar_combined_works_content created | since_start: 11.0 minutes, 20.68 seconds | since_last: 11.0 minutes, 20.68 seconds :: 


Unnamed: 0,id_semanticscholar,title,content_abstract,content_text,annotations_paragraph,annotations_section_header
0,260163299,Screening depression and anxiety in Indigenous peoples: A global scoping review,"Indigenous peoples’ worldviews are intricately interconnected and interrelated with their communities and the environments in which they live. Their worldviews also manifest in a holistic view of health and well-being, which contrasts with those of the dominant western biomedical model. However,...","\nIntroduction\n\nThe worldviews of Indigenous peoples are intricately interrelated and interconnected with those of their communities and the environments in which they live. Indigenous people conceptualise health and well-being more holistically (Gall et al., 2021) than the dominant western bi...","[{""attributes"":null,""end"":512,""start"":15},{""attributes"":null,""end"":1314,""start"":514},{""attributes"":null,""end"":2095,""start"":1316},{""attributes"":null,""end"":3698,""start"":2097},{""attributes"":null,""end"":4904,""start"":3700},{""attributes"":null,""end"":5652,""start"":4906},{""attributes"":null,""end"":6585,""star...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":5660,""start"":5654},{""attributes"":null,""end"":7233,""start"":7215},{""attributes"":null,""end"":11222,""start"":11207},{""attributes"":null,""end"":12489,""start"":12475},{""attributes"":null,""end"":13587,""start"":13572},{""attributes"":null,""end"":14789..."
1,112601881,Model Development of a Blast Furnace Stove,,\nIntroduction\n\nAbout one third of the world primary energy consumption is from the manufacturing industries. The iron and steel industry (ISI) is the second largest energy user and accounts for 20 % of the energy usage by the manufacturing industries [1]. Due to heavy reliance on fossil fuels...,"[{""attributes"":null,""end"":519,""start"":15},{""attributes"":null,""end"":1303,""start"":521},{""attributes"":null,""end"":1735,""start"":1305},{""attributes"":null,""end"":2270,""start"":1737},{""attributes"":null,""end"":2995,""start"":2272},{""attributes"":null,""end"":3406,""start"":2997},{""attributes"":null,""end"":3691,""star...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":3414,""start"":3408},{""attributes"":null,""end"":4208,""start"":4172},{""attributes"":{""n"":""2.1.""},""end"":5536,""start"":5527},{""attributes"":{""n"":""2.2.""},""end"":7243,""start"":7228},{""attributes"":{""n"":""2.3.""},""end"":7941,""start"":7909},..."
2,118573517,Double beta decay transition mechanism,"After briefly reviewing $\beta \beta$ decay as a test of the neutrino mass, I examine the nuclear structure involved in this process. Simple formulas (\`{a} la Pad\'{e}) are designed for the transition amplitudes and the general behavior of $\beta \beta$ decay amplitudes in the quasiparticle ran...",\nIntroduction\n\nThe double beta (ββ) decay is a nice example of the interrelation between the Particle Physics and the Nuclear Physics: we can get information on the properties of the neutrino and the weak interaction from the ββ decay only if we know who to deal we the nuclear structure invol...,"[{""attributes"":null,""end"":582,""start"":15},{""attributes"":null,""end"":1069,""start"":584},{""attributes"":null,""end"":1224,""start"":1071},{""attributes"":null,""end"":1310,""start"":1226},{""attributes"":null,""end"":1343,""start"":1312},{""attributes"":null,""end"":1420,""start"":1345},{""attributes"":null,""end"":1839,""star...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""3.""},""end"":10044,""start"":9997},{""attributes"":{""n"":""4.""},""end"":14193,""start"":14165}]"


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_semanticscholar_combined_works_content queries finished | since_start: 11.0 minutes, 31.19 seconds | since_last: 10.51 seconds :: 


' :: "02_stg".stg_semanticscholar_combined_works_content queries finished | since_start: 11.0 minutes, 31.19 seconds | since_last: 10.51 seconds :: '

In [19]:
table_name = 'stg_semanticscholar_combined_works_metadata'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_semanticscholar_combined_works_metadata/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_semanticscholar_combined_works_metadata created | since_start: 5.85 seconds | since_last: 5.85 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,source_url,openaccess_status,license,publication_year,publication_date
0,46791531,2755160340.0,10.1364/oe.25.023899,,https://doi.org/10.1364/OE.25.023899,GOLD,CCBY,2017,2017-10-02
1,13140185,2010414254.0,10.3390/rs6031863,,https://doi.org/10.3390/rs6031863,GOLD,CCBY,2014,2014-02-28
2,253366618,,10.1016/j.xpro.2022.101803,,https://pmc.ncbi.nlm.nih.gov/articles/PMC9641055,GOLD,CCBYNCND,2022,2022-11-04


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_semanticscholar_combined_works_metadata queries finished | since_start: 15.13 seconds | since_last: 9.28 seconds :: 


' :: "02_stg".stg_semanticscholar_combined_works_metadata queries finished | since_start: 15.13 seconds | since_last: 9.28 seconds :: '

In [5]:
table_name = 'base_arxiv_metadata'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=500)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.base_arxiv_metadata already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg base_arxiv_metadata
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_arxiv_metadata/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".base_arxiv_metadata created | since_start: 15.44 seconds | since_last: 15.44 seconds :: 


Unnamed: 0,id_arxiv,id_doi,title,abstract,license
0,903.1601,,Parabolic-Dish Solar Concentrators of Film on Foam,"Parabolic and spherical mirrors are constructed of aluminized PET polyester film on urethane foam. During construction, the chosen shape of the mirror is created by manipulating the elastic/plastic behavior of the film with air pressure. Foam is then applied to the film and, once hardened, air pressure is removed. At an f-number of 0.68, preliminary models have an optical angular spread of less than 0.25 degrees, a factor of 3.3 smaller than that for a perfectly spherical mirror. The possi...",ArXiv nonexclusive-distrib
1,903.1604,10.3842/SIGMA.2009.029,Limits of Gaudin Systems: Classical and Quantum Cases,"We consider the XXX homogeneous Gaudin system with $N$ sites, both in classical and the quantum case. In particular we show that a suitable limiting procedure for letting the poles of its Lax matrix collide can be used to define new families of Liouville integrals (in the classical case) and new ""Gaudin"" algebras (in the quantum case). We will especially treat the case of total collisions, that gives rise to (a generalization of) the so called Bending flows of Kapovich and Millson. Some as...",CCBYNCSA
2,903.16,,Typically Real Harmonic Functions,We consider a class $\THO$ of typically real harmonic functions on the unit disk that contains the class of normalized analytic and typically real functions. We also obtain some partial results about the region of univalence for this class.,ArXiv nonexclusive-distrib


Unnamed: 0,c
0,2816721


 :: "02_stg".base_arxiv_metadata queries finished | since_start: 24.78 seconds | since_last: 9.35 seconds :: 


' :: "02_stg".base_arxiv_metadata queries finished | since_start: 24.78 seconds | since_last: 9.35 seconds :: '

In [21]:
table_name = 'stg_unified_works_metadata_01_joined_to_arxiv'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_metadata_01_joined_to_arxiv/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_metadata_01_joined_to_arxiv created | since_start: 11.79 seconds | since_last: 11.79 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,source_url,openaccess_status,publication_year,publication_date,license,license_allows_derivative_reuse
0,4951032,2789407911,10.1016/j.tecto.2018.03.010,,https://doi.org/10.1016/J.TECTO.2018.03.010,HYBRID,2018,2018-04-22,CCBY,1
1,204923380,2981732074,10.1016/j.jalz.2019.08.201,,https://pmc.ncbi.nlm.nih.gov/articles/PMC7012375,HYBRID,2019,2019-10-28,CCBYNCND,0
2,14519185,2114693455,10.1099/vir.0.007377-0,,https://pmc.ncbi.nlm.nih.gov/articles/PMC2885064,HYBRID,2009,2009-03-01,CCBY,1


Unnamed: 0,c
0,11609787


 :: "02_stg".stg_unified_works_metadata_01_joined_to_arxiv queries finished | since_start: 21.14 seconds | since_last: 9.35 seconds :: 


' :: "02_stg".stg_unified_works_metadata_01_joined_to_arxiv queries finished | since_start: 21.14 seconds | since_last: 9.35 seconds :: '

In [2]:
table_name = 'stg_unified_works_metadata_02_joined_to_openalex'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_unified_works_metadata_02_joined_to_openalex already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_unified_works_metadata_02_joined_to_openalex
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_metadata_02_joined_to_openalex/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_metadata_02_joined_to_openalex created | since_start: 10.35 seconds | since_last: 10.35 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on
0,235199166,,10.2196/23099,,2020,2020-07-31,CCBY,1,https://pmc.ncbi.nlm.nih.gov/articles/PMC8190645,0,1,1,3165206559,10.2196/23099,en,10028,Topic Modeling,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,doi
1,259039518,,10.1109/lwc.2023.3281881,2306.0538,2023,2023-05-24,ArXiv nonexclusive-distrib,0,https://arxiv.org/abs/2306.05380,0,1,1,4379033818,10.1109/lwc.2023.3281881,en,10764,Privacy-Preserving Technologies in Data,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,doi
2,271769360,,10.18653/v1/2024.wassa-1.43,,2024,,unknown-reusability,0,https://aclanthology.org/2024.wassa-1.43,0,1,1,4402670554,10.18653/v1/2024.wassa-1.43,en,10028,Topic Modeling,1702,Artificial Intelligence,17,Computer Science,3,Physical Sciences,doi


Unnamed: 0,c
0,834072


 :: "02_stg".stg_unified_works_metadata_02_joined_to_openalex queries finished | since_start: 18.42 seconds | since_last: 8.07 seconds :: 


' :: "02_stg".stg_unified_works_metadata_02_joined_to_openalex queries finished | since_start: 18.42 seconds | since_last: 8.07 seconds :: '

In [22]:
table_name = 'stg_unified_works_metadata_03_filtered_and_tagged'
db_name = '02_stg'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_unified_works_metadata_03_filtered_and_tagged already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_unified_works_metadata_03_filtered_and_tagged
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_metadata_03_filtered_and_tagged/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_metadata_03_filtered_and_tagged created | since_start: 6.76 seconds | since_last: 6.76 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on,bucket_10p,subset
0,218571297,3023155397.0,,2005.04094,2020,2020-05-05,CCBY,1,https://arxiv.org/abs/2005.04094,1,0,1,3023155397,10.48550/arxiv.2005.04094,en,10054,Parallel Computing and Optimization Techniques,1708,Hardware and Architecture,17,Computer Science,3,Physical Sciences,mag,1,test
1,208020257,3101804962.0,10.1007/s10766-019-00646-x,1911.08779,2019,2019-11-15,CCBY,1,https://arxiv.org/abs/1911.08779,1,1,1,3101804962,10.1007/s10766-019-00646-x,en,10054,Parallel Computing and Optimization Techniques,1708,Hardware and Architecture,17,Computer Science,3,Physical Sciences,doi,1,test
2,259736562,,10.1080/19942060.2023.2210196,,2023,2023-06-28,CCBY,1,https://doi.org/10.1080/19942060.2023.2210196,0,1,1,4382395626,10.1080/19942060.2023.2210196,en,10054,Parallel Computing and Optimization Techniques,1708,Hardware and Architecture,17,Computer Science,3,Physical Sciences,doi,1,test


Unnamed: 0,c
0,396430


 :: "02_stg".stg_unified_works_metadata_03_filtered_and_tagged queries finished | since_start: 14.27 seconds | since_last: 7.51 seconds :: 


' :: "02_stg".stg_unified_works_metadata_03_filtered_and_tagged queries finished | since_start: 14.27 seconds | since_last: 7.51 seconds :: '

In [10]:
table_name = 'stg_topics'
db_name = '02_stg'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} ORDER BY openalex_primary_topic_count DESC LIMIT 300 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_topics/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_topics created | since_start: 3.00 seconds | since_last: 3.00 seconds :: 


Unnamed: 0,openalex_primary_topic_index,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_count,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name
0,0,10028,Topic Modeling,18734,1702,Artificial Intelligence
1,1,10181,Natural Language Processing Techniques,14656,1702,Artificial Intelligence
2,2,10211,Computational Drug Discovery Methods,11761,1703,Computational Theory and Mathematics
3,3,10020,Quantum Information and Cryptography,8571,1702,Artificial Intelligence
4,4,10682,Quantum Computing Algorithms and Architecture,7384,1702,Artificial Intelligence
5,5,10270,Blockchain Technology Applications and Security,6590,1710,Information Systems
6,6,10036,Advanced Neural Network Applications,6192,1707,Computer Vision and Pattern Recognition
7,7,10862,AI in cancer detection,5038,1702,Artificial Intelligence
8,8,10320,Neural Networks and Applications,4612,1702,Artificial Intelligence
9,9,10273,IoT and Edge/Fog Computing,4543,1705,Computer Networks and Communications


Unnamed: 0,c
0,310


 :: "02_stg".stg_topics queries finished | since_start: 9.69 seconds | since_last: 6.68 seconds :: 


' :: "02_stg".stg_topics queries finished | since_start: 9.69 seconds | since_last: 6.68 seconds :: '

In [11]:
table_name = 'stg_subfields'
db_name = '02_stg'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} ORDER BY openalex_primary_topic_subfield_count DESC LIMIT 300 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_subfields already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_subfields
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_subfields/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_subfields created | since_start: 3.30 seconds | since_last: 3.30 seconds :: 


Unnamed: 0,openalex_primary_topic_subfield_index,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_subfield_count
0,0,1702,Artificial Intelligence,151390
1,1,1707,Computer Vision and Pattern Recognition,64993
2,2,1710,Information Systems,54580
3,3,1705,Computer Networks and Communications,42753
4,4,1703,Computational Theory and Mathematics,35684
5,5,1711,Signal Processing,16110
6,6,1709,Human-Computer Interaction,10278
7,7,1706,Computer Science Applications,8975
8,8,1708,Hardware and Architecture,4559
9,9,1704,Computer Graphics and Computer-Aided Design,4400


Unnamed: 0,c
0,11


 :: "02_stg".stg_subfields queries finished | since_start: 9.87 seconds | since_last: 6.57 seconds :: 


' :: "02_stg".stg_subfields queries finished | since_start: 9.87 seconds | since_last: 6.57 seconds :: '

In [12]:
table_name = 'stg_unified_works_metadata_04_with_topics_and_subfields'
db_name = '02_stg'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} ORDER BY RANDOM() LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_metadata_04_with_topics_and_subfields/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_metadata_04_with_topics_and_subfields created | since_start: 7.25 seconds | since_last: 7.25 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_count,openalex_primary_topic_index,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_subfield_count,openalex_primary_topic_subfield_index,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on,subset
0,271544022,,10.48550/arxiv.2407.20608,2407.20608,2024,2024-07-30,CCBY,1,https://arxiv.org/abs/2407.20608,0,1,1,4401203064,10.48550/arxiv.2407.20608,en,10181,Natural Language Processing Techniques,14656,1,1702,Artificial Intelligence,151390,0,17,Computer Science,3,Physical Sciences,doi,train
1,181913055,2946735810.0,10.3390/bdcc3020030,,2019,2019-05-25,CCBY,1,https://doi.org/10.3390/BDCC3020030,1,1,1,2946735810,10.3390/bdcc3020030,en,10028,Topic Modeling,1,308,1702,Artificial Intelligence,151390,0,17,Computer Science,3,Physical Sciences,doi,train
2,42593093,2556308296.0,10.1007/s10479-016-2367-1,,2016,2016-11-08,CCBY,1,https://doi.org/10.1007/s10479-016-2367-1,1,1,1,2556308296,10.1007/s10479-016-2367-1,en,11063,Rough Sets and Fuzzy Logic,1144,113,1703,Computational Theory and Mathematics,35684,4,17,Computer Science,3,Physical Sciences,doi,test


Unnamed: 0,c
0,433261


 :: "02_stg".stg_unified_works_metadata_04_with_topics_and_subfields queries finished | since_start: 15.54 seconds | since_last: 8.29 seconds :: 


' :: "02_stg".stg_unified_works_metadata_04_with_topics_and_subfields queries finished | since_start: 15.54 seconds | since_last: 8.29 seconds :: '

In [13]:
table_name = 'stg_unified_works_filtered'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = '02_stg',
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
)
timelogger.log(f'"02_stg".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "02_stg".{table_name} LIMIT 3 """, '02_stg'))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "02_stg".{table_name} """, '02_stg'))
timelogger.log(f'"02_stg".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 02_stg.stg_unified_works_filtered already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 02_stg stg_unified_works_filtered
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/02_stg/stg_unified_works_filtered/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/02_stg
 :: "02_stg".stg_unified_works_filtered created | since_start: 1.0 minute, 28.09 seconds | since_last: 1.0 minute, 28.09 seconds :: 


Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,has_id_doi,has_id_mag_or_doi,openalex_id_openalex,openalex_id_doi,openalex_language,openalex_primary_topic_id,openalex_primary_topic_display_name,openalex_primary_topic_count,openalex_primary_topic_index,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,openalex_primary_topic_subfield_count,openalex_primary_topic_subfield_index,openalex_primary_topic_field_id,openalex_primary_topic_field_display_name,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on,title,content_abstract,content_text,annotations_paragraph,annotations_section_header,subset
0,209051556,2985924634.0,10.1016/j.ipm.2019.102135,,2020,2020-03-01,CCBY,1,https://doi.org/10.1016/j.ipm.2019.102135,1,1,1,2985924634,10.1016/j.ipm.2019.102135,en,11675,Open Source Software Innovations,757,165,1706,Computer Science Applications,8975,7,17,Computer Science,3,Physical Sciences,doi,Information Technology (IT) enabled crowdsourcing: A conceptual framework,,"\nIntroduction\n\nSocial networking systems allow us to connect easily with one another to communicate, learn, educate, conduct business and solve problems. Advances in connective and collaborative technological environment have enabled individuals to get involved in internet-mediated social par...","[{""attributes"":null,""end"":576,""start"":15},{""attributes"":null,""end"":2123,""start"":590},{""attributes"":null,""end"":3316,""start"":2125},{""attributes"":null,""end"":3831,""start"":3318},{""attributes"":null,""end"":4356,""start"":3833},{""attributes"":null,""end"":4892,""start"":4358},{""attributes"":null,""end"":5824,""star...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":588,""start"":578},{""attributes"":{""n"":""3.""},""end"":5955,""start"":5900},{""attributes"":{""n"":""3.1.""},""end"":16287,""start"":16281},{""attributes"":{""n"":""3.2.""},""end"":16945,""start"":16941},{""attributes"":{""n"":""3.2.2.""},""end"":17912,""st...",validation
1,7604401,1980215929.0,10.1371/journal.pone.0103023,1405.4298,2014,2014-05-16,CCBY,1,https://pmc.ncbi.nlm.nih.gov/articles/PMC4118854,1,1,1,1980215929,10.1371/journal.pone.0103023,en,11675,Open Source Software Innovations,757,165,1706,Computer Science Applications,8975,7,17,Computer Science,3,Physical Sciences,doi,How Much Is the Whole Really More than the Sum of Its Parts? 1 ⊞ 1 = 2.5: Superlinear Productivity in Collective Group Actions,"In a variety of open source software projects, we document a superlinear growth of production intensity () as a function of the number of active developers , with a median value of the exponent , with large dispersions of from slightly less than up to . For a typical project in this class, doubl...","\nI. INTRODUCTION\n\nSince at least Aristotle, the adage in the title has permeated human thinking, with prominent influence in psychology (Gestalt theory [1]), biology (brain functions [2], ecological networks [3]), physics (spontaneous symmetry breaking [4] and the ""more is different"" concept ...","[{""attributes"":null,""end"":1345,""start"":18},{""attributes"":null,""end"":2091,""start"":1347},{""attributes"":null,""end"":2677,""start"":2093},{""attributes"":null,""end"":2953,""start"":2679},{""attributes"":null,""end"":3194,""start"":2955},{""attributes"":null,""end"":3534,""start"":3266},{""attributes"":null,""end"":4030,""st...","[{""attributes"":null,""end"":16,""start"":1},{""attributes"":null,""end"":3254,""start"":3196},{""attributes"":null,""end"":3264,""start"":3256},{""attributes"":null,""end"":7724,""start"":7682},{""attributes"":null,""end"":7851,""start"":7794},{""attributes"":null,""end"":10483,""start"":10428},{""attributes"":null,""end"":14015,""st...",train
2,261265554,,10.1007/s00530-023-01167-x,,2023,2023-08-27,CCBY,1,https://doi.org/10.1007/s00530-023-01167-x,0,1,1,4386204001,10.1007/s00530-023-01167-x,en,11478,Caching and Content Delivery,1244,105,1705,Computer Networks and Communications,42753,3,17,Computer Science,3,Physical Sciences,doi,A social-aware video sharing solution using demand prediction of epidemic-based propagation in wireless networks,The video services that account for the majority of global network traffic consume significant amounts of electricity and network resources to meet the large-scale demand of users. Variations in user interest and social influence lead to high maintenance costs for achieving a dynamic balance bet...,"\nIntroduction\n\nThe rapidly evolving wireless communication technologies, such as 5 G, not only enable ubiquitous user access but also enhance network bandwidth to deliver content-rich and high-definition videos [1][2][3][4][5]. Internet video services rely on captivating content and convenien...","[{""attributes"":null,""end"":2032,""start"":15},{""attributes"":null,""end"":3434,""start"":2034},{""attributes"":null,""end"":3999,""start"":3436},{""attributes"":null,""end"":4879,""start"":4001},{""attributes"":null,""end"":5505,""start"":4881},{""attributes"":null,""end"":13552,""start"":5521},{""attributes"":null,""end"":13581,""...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{""attributes"":{""n"":""2""},""end"":5519,""start"":5507},{""attributes"":{""n"":""3""},""end"":13575,""start"":13554},{""attributes"":{""n"":""3.1.2""},""end"":14165,""start"":14127},{""attributes"":{""n"":""3.1.3""},""end"":18432,""start"":18392},{""attributes"":{""n"":""3.1.4""},""end"":19660,""...",train


Unnamed: 0,c
0,433261


 :: "02_stg".stg_unified_works_filtered queries finished | since_start: 1.0 minute, 36.41 seconds | since_last: 8.32 seconds :: 


' :: "02_stg".stg_unified_works_filtered queries finished | since_start: 1.0 minute, 36.41 seconds | since_last: 8.32 seconds :: '

In [14]:
table_name = 'unified_works'
db_name = '03_core'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    s3_parent_target_path=config.S3_CORE_DATA_PATH
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=100)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 03_core.unified_works already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core unified_works
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".unified_works created | since_start: 40.67 seconds | since_last: 40.67 seconds :: 


Unnamed: 0,id,topic_index,topic_display_name,subfield_index,subfield_display_name,title,abstract,fulltext,subset
0,209051556,165,Open Source Software Innovations,7,Computer Science Applications,Information Technology (IT) enabled crowdsourcing: A conceptual framework,,"\nIntroduction\n\nSocial networking systems allow us to connect easily with one another to communicate, learn, educate, conduct business and solve problems. Advances in connective and collaborative technological environment have enabled individuals to get involved in internet-mediated social par...",validation
1,7604401,165,Open Source Software Innovations,7,Computer Science Applications,How Much Is the Whole Really More than the Sum of Its Parts? 1 ⊞ 1 = 2.5: Superlinear Productivity in Collective Group Actions,"In a variety of open source software projects, we document a superlinear growth of production intensity () as a function of the number of active developers , with a median value of the exponent , with large dispersions of from slightly less than up to . For a typical project in this class, doubl...","\nI. INTRODUCTION\n\nSince at least Aristotle, the adage in the title has permeated human thinking, with prominent influence in psychology (Gestalt theory [1]), biology (brain functions [2], ecological networks [3]), physics (spontaneous symmetry breaking [4] and the ""more is different"" concept ...",train
2,250144849,73,Machine Learning and Data Classification,0,Artificial Intelligence,ZeroC: A Neuro-Symbolic Model for Zero-shot Concept Recognition and Acquisition at Inference Time,"Humans have the remarkable ability to recognize and acquire novel visual concepts in a zero-shot manner. Given a high-level, symbolic description of a novel concept in terms of previously learned visual concepts and their relations, humans can recognize novel concepts without seeing any examples...","\nIntroduction\n\nHumans learn in diverse ways. Besides learning from demonstrations of a novel concept, humans can also learn concepts on a high-level. Consider learning the ""rectangle"" concept, for example. Suppose that one has never seen such a concept, but has already mastered the concept of...",validation


Unnamed: 0,c
0,433261


 :: "03_core".unified_works queries finished | since_start: 49.46 seconds | since_last: 8.79 seconds :: 


' :: "03_core".unified_works queries finished | since_start: 49.46 seconds | since_last: 8.79 seconds :: '

In [15]:
timelogger = utils.TimeLogger()
db_name = '03_core'
table_names = ['unified_works_train', 'unified_works_test', 'unified_works_validation']
for table_name in table_names:
    utils.create_table_from_sql_file(
        database_name = db_name,
        table_name = table_name,
        overwrite_strategy='overwrite', # options: fail, overwrite, ignore
        wait=True,
        s3_parent_target_path=config.S3_CORE_DATA_PATH
    )
    timelogger.log(f'"{db_name}".{table_name} created')
    
    utils.pd_set_options(cols=100)
    display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
    display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
    timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 03_core.unified_works_train already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core unified_works_train
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works_train/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".unified_works_train created | since_start: 33.65 seconds | since_last: 33.65 seconds :: 


Unnamed: 0,id,topic_index,topic_display_name,subfield_index,subfield_display_name,title,abstract,fulltext,subset
0,267785661,7,AI in cancer detection,0,Artificial Intelligence,BGRD-TransUNet: A Novel TransUNet-Based Model for Ultrasound Breast Lesion Segmentation,Breast UltraSound (BUS) imaging is a commonly used diagnostic tool in the field of counter fight...,"\nI. INTRODUCTION\n\nAccording to the latest global cancer data report [1], from 2015 to 2019, t...",train
1,252873558,7,AI in cancer detection,0,Artificial Intelligence,HoechstGAN: Virtual Lymphocyte Staining Using Generative Adversarial Networks,The presence and density of specific types of immune cells are important to understand a patient...,\nIntroduction\n\nThe UK incidence of kidney cancer is projected to rise by 26% to 32 cases per ...,train
2,273345655,7,AI in cancer detection,0,Artificial Intelligence,Performance Evaluation of Deep Learning and Transformer Models Using Multimodal Data for Breast ...,Rising breast cancer (BC) occurrence and mortality are major global concerns for women. Deep lea...,\nIntroduction\n\nBC is the most prevalent disease among women [20]. Anticipated figures for 202...,train


Unnamed: 0,c
0,346344


 :: "03_core".unified_works_train queries finished | since_start: 42.80 seconds | since_last: 9.15 seconds :: 
Table 03_core.unified_works_test already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core unified_works_test
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works_test/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".unified_works_test created | since_start: 51.99 seconds | since_last: 9.19 seconds :: 


Unnamed: 0,id,topic_index,topic_display_name,subfield_index,subfield_display_name,title,abstract,fulltext,subset
0,252345140,307,Advanced Neural Network Applications,1,Computer Vision and Pattern Recognition,Position-Aware Anti-Aliasing Filters for 3D Medical Image Analysis,"Maximum pooling, average pooling, and strided convolution are three widely adopted down-sampling...",\nI. INTRODUCTION\n\nDown-sampling has been a fundamental component of digital signal processing...,test
1,252345140,6,Advanced Neural Network Applications,1,Computer Vision and Pattern Recognition,Position-Aware Anti-Aliasing Filters for 3D Medical Image Analysis,"Maximum pooling, average pooling, and strided convolution are three widely adopted down-sampling...",\nI. INTRODUCTION\n\nDown-sampling has been a fundamental component of digital signal processing...,test
2,236088079,307,Advanced Neural Network Applications,1,Computer Vision and Pattern Recognition,Dynamic Transformer for Efficient Machine Translation on Embedded Devices,"The Transformer architecture is widely used for machine translation tasks. However, its resource...",\nI. INTRODUCTION\n\nMachine translation is a fast-growing application of Natural Language Proce...,test


Unnamed: 0,c
0,43478


 :: "03_core".unified_works_test queries finished | since_start: 1.0 minute, 0.19 seconds | since_last: 8.21 seconds :: 
Table 03_core.unified_works_validation already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core unified_works_validation
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works_validation/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".unified_works_validation created | since_start: 1.0 minute, 9.21 seconds | since_last: 9.01 seconds :: 


Unnamed: 0,id,topic_index,topic_display_name,subfield_index,subfield_display_name,title,abstract,fulltext,subset
0,259014777,88,Coding theory and cryptography,0,Artificial Intelligence,On the exceptionality of rational APN functions,We investigate APN functions which can be represented as rational functions and we provide non-e...,\nIntroduction\n\nLet F q be the finite fields with q = 2 n elements. APN functions in even char...,validation
1,9461708,88,Coding theory and cryptography,0,Artificial Intelligence,"How to obtain lattices from $$(f,\sigma ,\delta )$$(f,σ,δ)-codes via a generalization of Constru...","We show how cyclic $$(f,\sigma ,\delta )$$(f,σ,δ)-codes over finite rings canonically induce a $...",\nIntroduction\n\nRecently several classes of linear codes with a better minimal distance for ce...,validation
2,266961516,135,Video Analysis and Summarization,1,Computer Vision and Pattern Recognition,An enhanced Swin Transformer for soccer player reidentification,"The re-identification (ReID) of objects in images is a widely studied topic in computer vision, ...",\nUsing Swin Transformer as a backbone network to extract image features to address the issue of...,validation


Unnamed: 0,c
0,43439


 :: "03_core".unified_works_validation queries finished | since_start: 1.0 minute, 19.71 seconds | since_last: 10.50 seconds :: 


In [16]:
display(wr.athena.read_sql_query(f"""
    SELECT 
        subset,
        COUNT(*) AS c,
        COUNT(*) * 100.0 / (SELECT COUNT(*) FROM "03_core".unified_works) AS p
    FROM 
        "03_core".unified_works_train
    GROUP BY
        subset
    
    UNION ALL

    SELECT 
        subset,
        COUNT(*) AS c,
        COUNT(*) * 100.0 / (SELECT COUNT(*) FROM "03_core".unified_works) AS p
    FROM 
        "03_core".unified_works_test
    GROUP BY
        subset

    UNION ALL

    SELECT 
        subset,
        COUNT(*) AS c,
        COUNT(*) * 100.0 / (SELECT COUNT(*) FROM "03_core".unified_works) AS p
    FROM 
        "03_core".unified_works_validation
    GROUP BY
        subset
""", db_name))

Unnamed: 0,subset,c,p
0,test,43478,10.03506
1,validation,43439,10.026058
2,train,346344,79.938882


In [17]:
table_name = 'topics'
db_name = '03_core'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    s3_parent_target_path=config.S3_CORE_DATA_PATH
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 30 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/topics/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".topics created | since_start: 2.43 seconds | since_last: 2.43 seconds :: 


Unnamed: 0,topic_index,topic_original_id,topic_display_name,topic_count,subfield_original_id,subfield_display_name
0,0,10028,Topic Modeling,18734,1702,Artificial Intelligence
1,1,10181,Natural Language Processing Techniques,14656,1702,Artificial Intelligence
2,2,10211,Computational Drug Discovery Methods,11761,1703,Computational Theory and Mathematics
3,7,10862,AI in cancer detection,5038,1702,Artificial Intelligence
4,8,10320,Neural Networks and Applications,4612,1702,Artificial Intelligence
5,9,10273,IoT and Edge/Fog Computing,4543,1705,Computer Networks and Communications
6,10,10400,Network Security and Intrusion Detection,4440,1705,Computer Networks and Communications
7,31,10215,Semantic Web and Ontologies,2764,1702,Artificial Intelligence
8,32,10100,Metaheuristic Optimization Algorithms Research,2764,1702,Artificial Intelligence
9,33,10201,Speech Recognition and Synthesis,2750,1702,Artificial Intelligence


Unnamed: 0,c
0,310


 :: "03_core".topics queries finished | since_start: 9.37 seconds | since_last: 6.94 seconds :: 


' :: "03_core".topics queries finished | since_start: 9.37 seconds | since_last: 6.94 seconds :: '

In [18]:
table_name = 'subfields'
db_name = '03_core'
timelogger = utils.TimeLogger()
utils.create_table_from_sql_file(
    database_name = db_name,
    table_name = table_name,
    overwrite_strategy='overwrite', # options: fail, overwrite, ignore
    wait=True,
    s3_parent_target_path=config.S3_CORE_DATA_PATH
)
timelogger.log(f'"{db_name}".{table_name} created')

utils.pd_set_options(cols=300)
display(wr.athena.read_sql_query(f"""SELECT * FROM "{db_name}".{table_name} LIMIT 3 """, db_name))
display(wr.athena.read_sql_query(f"""SELECT COUNT(*) AS c FROM "{db_name}".{table_name} """, db_name))
timelogger.log(f'"{db_name}".{table_name} queries finished')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
Table 03_core.subfields already exists. Overwriting since overwrite_strategy=="overwrite".
Deleting table from Glue Catalog 03_core subfields
Deleting S3 objects from s3://sagemaker-research-methodology-extraction/01_data/03_core/subfields/
s3_parent_target_path:  s3://sagemaker-research-methodology-extraction/01_data/03_core
 :: "03_core".subfields created | since_start: 4.35 seconds | since_last: 4.35 seconds :: 


Unnamed: 0,subfield_index,subfield_original_id,subfield_display_name,subfield_count
0,0,1702,Artificial Intelligence,151390
1,3,1705,Computer Networks and Communications,42753
2,4,1703,Computational Theory and Mathematics,35684


Unnamed: 0,c
0,11


 :: "03_core".subfields queries finished | since_start: 10.95 seconds | since_last: 6.60 seconds :: 


' :: "03_core".subfields queries finished | since_start: 10.95 seconds | since_last: 6.60 seconds :: '

In [18]:
utils.pd_set_options(cols=500)

wr.athena.read_sql_query("""
WITH
stg_unified_works_metadata_02_joined_to_openalex_ AS (
    SELECT * FROM "02_stg".stg_unified_works_metadata_02_joined_to_openalex
),
stg_semanticscholar_combined_works_content_ AS (
SELECT * FROM "02_stg".stg_semanticscholar_combined_works_content
),
metadata_filtered AS (
    SELECT 
        * 
    FROM
        stg_unified_works_metadata_02_joined_to_openalex_
    WHERE
        openalex_language='en' AND
        license_allows_derivative_reuse=1
),
numbered AS (
    SELECT
        *,
        NTILE(10) OVER( PARTITION BY openalex_primary_topic_id ORDER BY random()) AS bucket_10p
    FROM
        metadata_filtered
)
SELECT id_semanticscholar, bucket_10p FROM numbered WHERE openalex_primary_topic_id=13932 ORDER BY id_semanticscholar
 """, '01_raw')

Unnamed: 0,id_semanticscholar,bucket_10p
0,18991472,7
1,54460202,2
2,54460599,6
3,54462925,5
4,54574519,1
5,145582524,3
6,159229472,2
7,167039776,10
8,212995807,4
9,216341058,8
