In [2]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Make the shared modules importable by adding ../01_modules (the "dev" layout)
# and/or ./01_modules (the "prod" layout) to sys.path, whichever exists.
if '__file__' in globals():
    # __file__ is defined (e.g. when run as a script): anchor on this file's folder.
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    # __file__ is not defined (e.g. in a notebook kernel): fall back to the
    # current working directory.
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
for _modules_path in (modules_path_in_dev, modules_path_in_prod):
    # Only register directories that exist, and skip ones already registered so
    # that re-running this cell does not pile up duplicate sys.path entries.
    if os.path.exists(_modules_path) and _modules_path not in sys.path:
        sys.path.append(_modules_path)
del _modules_path  # keep the loop helper out of the notebook namespace


# A running kernel only executes a local module on its first import; re-running
# a plain `import` afterwards does not pick up edits made to the module file.
# As a workaround, import each module and then force importlib.reload() on it.
# The name is rebound to reload()'s return value, which keeps the cell silent
# without assigning to `_` (IPython uses `_` for the last cell output).
import utils
utils = importlib.reload(utils)
import config
config = importlib.reload(config)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [3]:
# Display tweak via the project helper; presumably raises pandas' column
# display limit to 500 -- confirm in utils.pd_set_options.
utils.pd_set_options(cols=500)

# Peek at five unprocessed rows of the raw S2ORC v2 table (all columns).
wr.athena.read_sql_query(
    sql="""
SELECT
    *
 FROM 
     "01_raw".semanticscholar_s2orc_v2 -- v2 has an _ prefix, it is removed from dowstream models
 LIMIT 5
 """,
    database='01_raw',
)

Unnamed: 0,corpusid,openaccessinfo,title,authors,body,bibliography
0,39964499,"{'disclaimer': 'This content is derived from https://arxiv.org/abs/1208.0130. ', 'externalids': {'medline': '23214774v1', 'mag': '2081684912', 'acl': None, 'doi': '10.1103/PhysRevE.86.051302', 'medrxiv': None, 'pubmedcentral': None, 'arxiv': '1208.0130'}, 'license': None, 'url': 'https://arxiv.org/abs/1208.0130', 'status': 'GREEN'}",Origin of rebounds with a restitution coefficient larger than unity in nanocluster collisions.,"[H. Kuninaka, H. Hayakawa]","{'text': ' I. INTRODUCTION Nanoclusters are technologically important for the construction of nanodevices. Because the size of nanoclusters is mesoscopic, thermodynamic properties of such materials are still not well understood [1], though the methods to make nanoclusters such as adiabatic expansion through a nozzle and a laser ablation technique are well established. [2] Dynamics of nanoclusters are extensively investigated from both scientific and technological interest. There are many nu...","{'text': 'AcknowledgmentsWe would like to thank N. V. Brilliantov, T. Kawasaki, S. Takesue, R. Murakami, and K. Saitoh for their valuable comments.Also, HK would like to thank Y. Wang and W. Lechner who gave him many advises for calculation of order parameters and how to use their calculation codes.Parts of numerical computation in this work were carried out at Yukawa Institute Computer Facility.This work was supported by the Grant-in-Aid for the Global COE Program ""The Next Generation of Ph..."
1,234595201,"{'disclaimer': 'This content is derived from https://pmc.ncbi.nlm.nih.gov/articles/PMC8100426. Its open-access license is CCBYNCSA.', 'externalids': {'medline': '33995511v1', 'mag': None, 'acl': None, 'doi': '10.11909/j.issn.1671-5411.2021.04.010', 'medrxiv': None, 'pubmedcentral': '8100426', 'arxiv': None}, 'license': 'CCBYNCSA', 'url': 'https://pmc.ncbi.nlm.nih.gov/articles/PMC8100426', 'status': None}",Prevention of self-harm through early detection of depression among the elderly with permanent pacemaker: a case report,"[Yu-Wei Chang, Ju-Yi Chen]","{'text': ' gnosis of sick sinus syndrome. Multiple somatic complaints, especially chronic back pain, had impacted the patient's sleep quality and daily life. The patient also had a diagnosis of general anxiety disorder, and she had been regularly visiting a psychiatric clinic for insomnia over the two years preceding this event. The patient occasionally expressed suicidal ideation but her family did not pay much attention to her suicidal verbalizations. At about 2:00 a.m., during the night...","{'text': 'ACKNOWLEDGMENTSThis study was supported by the Ministry of Science and Technology of Taiwan, China (MOST 108-2218-E-006-019 & MOST 109-2218-E-006-024).All authors had no conflicts of interest to disclose. Depression in older adults. A Fiske, J L Wetherell, M Gatz, Annu Rev Clin Psychol. 52009 . F M Kusumoto, M H Schoenfeld, C Barrett, 2018 ACC/AHA/HRS guideline on the evaluation and management of patients with bradycardia and cardiac conduction delay: a report of the American Col..."
2,14415362,"{'disclaimer': 'This content is derived from https://arxiv.org/abs/0705.0584. ', 'externalids': {'medline': None, 'mag': '2031777933', 'acl': None, 'doi': '10.1007/S00605-008-0535-3', 'medrxiv': None, 'pubmedcentral': None, 'arxiv': '0705.0584'}, 'license': None, 'url': 'https://arxiv.org/abs/0705.0584', 'status': 'CLOSED'}",Multidimensional continued fractions and a Minkowski function,[Giovanni Panti],"{'text': ' Preliminaries The nth order Farey set F n in the real unit interval [0, 1] is defined by recursion: one starts with F 0 = {0/1, 1/1} and obtains F n by adding to F n−1 all the Farey sums v 1 ⊕ v 2 = (a 1 + a 2 )/(b 1 + b 2 ) of two consecutive elements v i = a i /b i of F n−1 . The union of all the F n 's is the set of all rational numbers in [0, 1]. Analogously, by starting with B 0 = F 0 and replacing the Farey sum with the barycentric sum v 1 v 2 = (v 1 + v 2 )/2, we obtain an...","{'text': 'as an index of the singularity of Φ at p.As we already observed, λ Φ[∆ ā t ] = 2 −t .By the Shannon-McMillan-Breiman Theorem[3, §13]we have, for µ-all p (and hence for λ-all p, since µ and λ have the same nullsets), thatwhere h µ is the metrical entropy of M w.r.t.µ.Without loss of generality, we can assume that p is in the topological interior of ∆.For such a p, there exist t 0 and a constantTaking logarithms in ( * * ) we haveFor n = 2 we have h µ ∼ 0.54807 . . .and, as shown in ..."
3,18615301,"{'disclaimer': 'This content is derived from https://arxiv.org/abs/chao-dyn/9808003. ', 'externalids': {'medline': None, 'mag': '3102114047', 'acl': None, 'doi': '10.1021/jp9821836', 'medrxiv': None, 'pubmedcentral': None, 'arxiv': 'chao-dyn/9808003'}, 'license': None, 'url': 'https://arxiv.org/abs/chao-dyn/9808003', 'status': 'GREEN'}",SPECTRAL AUTOCORRELATION FUNCTION IN WEAKLY OPEN CHAOTIC SYSTEMS : INDIRECT PHOTODISSOCIATION OF MOLECULES,"[Y. Alhassid, Y. Fyodorov]","{'text': ' Quantum systems that are classically chaotic are believed to exhibit statistical fluctuations in their spectra and wavefunctions that are universal. 1 These universal properties are well-reproduced by the assumption that the Hamiltonians belong to an ensemble of Hamiltonians that are consistent with the underlying symmetries, but are otherwise random. Such random Hamiltonians are described by random matrix theory (RMT), 2,3 and lead to level repulsion, long range correlations in t...","{'text': ' . * On leave from Petersburg Nuclear Physics Institute. 188350 O Bohigas, Chaos and Quantum Physics, Les-Houches Session LII. M J Giannoni, North Holland, Amsterdam199191 . T A Brody, Rev. Mod. Phys. 533851981 C E Porter, Statistical Theory of Spectra: Fluctuations. New YorkAcademic Press1965 M L Mehta, Random Matrices. New YorkAcademic Press19912nd ed. . Th, H Zimmermann, L S Koppel, G Cederbaum, W Persch, Demtroder, Phys. Rev. Lett. 6131988 . D M Leitner, H Koppel, L S Ced..."
4,255895487,"{'disclaimer': 'This content is derived from https://doi.org/10.3390/en16020804. Its open-access status is GOLD and license is CCBY.', 'externalids': {'medline': None, 'mag': None, 'acl': None, 'doi': '10.3390/en16020804', 'medrxiv': None, 'pubmedcentral': None, 'arxiv': None}, 'license': 'CCBY', 'url': 'https://doi.org/10.3390/en16020804', 'status': 'GOLD'}","Along-Strike Reservoir Development of Steep-Slope Depositional Systems: Case Study from Liushagang Formation in the Weixinan Sag, Beibuwan Basin, South China Sea","[Shengfa Liu, Hongtao Zhu, Qianghu Liu, Ziqiang Zhou, Jiahao Chen]","{'text': ' Introduction Since the concept was first proposed in the 1960s, the fan delta depositional system has received increasing attention. With the development of research, studying the fan delta has gradually deepened from the initial study of sediment characteristics and outcrops to the sedimentary model and fan delta reservoirs [1][2][3][4][5][6]. With the gradual deepening of studies on the fan delta depositional system, we found that the fan delta depositional system is widely dev...",{'text': 'Acknowledgments:We thank the CNOOC (Hainan) for the release of all the data.We thank the two anonymous for reviewing this manuscript and the editorial department for editorial handling and helpful comments.Data Availability Statement:The data that support the findings of this study are available on request from the corresponding authors.The data are not publicly available due to confidentiality restrictions.Funding: This work was supported by Major Outsourcing Projects of China Nat...


In [13]:
# Flatten the nested raw columns for a 10-row sample: external ids, license,
# url and status are pulled out of `openaccessinfo`; the text and the
# paragraph / section-header annotations are pulled out of `body`.
wr.athena.read_sql_query(
    sql="""
SELECT
    corpusid AS id_semanticscholar,
    JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.externalids.mag') AS id_mag,
    JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.externalids.doi') AS id_doi,
    JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.externalids.arxiv') AS id_arxiv,
    title,
    JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.license') AS license,
    JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.url') AS source_url,
    JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.status') AS openaccess_status,
    JSON_EXTRACT_SCALAR(CAST(body AS JSON), '$.text') AS content_text,
    JSON_EXTRACT_SCALAR(CAST(body AS JSON), '$.annotations.paragraph') AS annotations_paragraph,
    JSON_EXTRACT_SCALAR(CAST(body AS JSON), '$.annotations.section_header') AS annotations_section_header
    --'x' AS "x"
 FROM 
     "01_raw".semanticscholar_s2orc_v2
 WHERE
     TRUE
 LIMIT 10
 """,
    database='01_raw',
)

Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,title,license,source_url,openaccess_status,content_text,annotations_paragraph,annotations_section_header
0,234869482,3155033926.0,10.5772/INTECHOPEN.97307,,Thoracoabdominal Compartment Syndrome,CCBY,https://doi.org/10.5772/INTECHOPEN.97307,HYBRID,"\nIntroduction\n\nThe diaphragm is formed from a number of composite origins in the embryo. The most important is the ""septum transversum"", which is a thick mass of cranial mesenchyme that gives rise to parts of the thoracic diaphragm. Without dwelling into more details, the septum transversum merges with mesoderm surrounding the esophagus, the growing pleura and peritoneum ('pleuroperitoneal folds') and the growing muscles of the abdominal wall. The septum transversum gives rise to the cent...","[{""attributes"":null,""end"":612,""start"":15},{""attributes"":null,""end"":730,""start"":614},{""attributes"":null,""end"":999,""start"":732},{""attributes"":null,""end"":1550,""start"":1001},{""attributes"":null,""end"":1777,""start"":1552},{""attributes"":null,""end"":2191,""start"":1779},{""attributes"":null,""end"":2552,""start"":2193},{""attributes"":null,""end"":2851,""start"":2554},{""attributes"":null,""end"":3350,""start"":2880},{""attributes"":null,""end"":3706,""start"":3352},{""attributes"":null,""end"":3930,""start"":3708},{""attributes"":null...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":2878,""start"":2853},{""attributes"":{""n"":""3.""},""end"":5257,""start"":5201},{""attributes"":{""n"":""3.1""},""end"":5279,""start"":5259},{""attributes"":{""n"":""3.2""},""end"":6340,""start"":6286},{""attributes"":{""n"":""3.3""},""end"":9066,""start"":9029},{""attributes"":{""n"":""4.""},""end"":10473,""start"":10463}]"
1,270297560,,10.3390/dj12060170,,Periodontal Regeneration of Vital Poor Prognosis Teeth with Attachment Loss Involving the Root Apex: Two Cases with up to 5 Years Follow-Up,CCBY,https://pmc.ncbi.nlm.nih.gov/articles/PMC11202695,GOLD,"\nIntroduction\n\nPeriodontal regeneration is the ultimate goal of periodontal therapy and is defined as the ""restoration of lost or diminished periodontal tissues including cementum, periodontal ligament, and alveolar bone"" [1].Guided tissue regeneration (GTR), on the other hand, refers to a surgical procedure seeking to obtain the objectives of periodontal regeneration through utilising barrier devices or membranes, to exclude epithelial cells and provide space maintenance [1].The clinical...","[{""attributes"":null,""end"":1005,""start"":15},{""attributes"":null,""end"":3444,""start"":1007},{""attributes"":null,""end"":4719,""start"":3469},{""attributes"":null,""end"":5712,""start"":4721},{""attributes"":null,""end"":6359,""start"":5722},{""attributes"":null,""end"":9027,""start"":6369},{""attributes"":null,""end"":9648,""start"":9037},{""attributes"":null,""end"":10673,""start"":9650},{""attributes"":null,""end"":12286,""start"":10683},{""attributes"":null,""end"":12578,""start"":12297},{""attributes"":null,""end"":13477,""start"":12589},{""attr...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":3467,""start"":3446},{""attributes"":{""n"":""2.1.""},""end"":5720,""start"":5714},{""attributes"":{""n"":""2.1.""},""end"":6367,""start"":6361},{""attributes"":{""n"":""2.2.""},""end"":9035,""start"":9029},{""attributes"":{""n"":""2.2.""},""end"":10681,""start"":10675},{""attributes"":{""n"":""3.""},""end"":12295,""start"":12288},{""attributes"":{""n"":""3.""},""end"":12587,""start"":12580},{""attributes"":{""n"":""4.""},""end"":13489,""start"":13479},{""attributes"":{""n"":""4.""},""end"":142..."
2,251953573,,10.1103/PhysRevD.108.094029,2208.14827,Entanglement renormalization of the class of continuous matrix product states,CCBY,https://arxiv.org/abs/2208.14827,HYBRID,"\nTensor Network states are the entanglement-based ansatz that has arisen in recent years based on the renormalization group (RG) ideas and later on developed using tools and concepts from quantum information theory.\n\nThe main examples include matrix product states (MPS) [1], projected entangled-pair states (PEPS) [2], and multiscale entanglement renormalization ansatz (MERA) [3].By construction, they obey the entropy/area law [4][5][6][7] and are able to encode both global and local symme...","[{""attributes"":null,""end"":215,""start"":1},{""attributes"":null,""end"":1158,""start"":217},{""attributes"":null,""end"":2915,""start"":1160},{""attributes"":null,""end"":3320,""start"":2917},{""attributes"":null,""end"":4885,""start"":3322},{""attributes"":null,""end"":5277,""start"":4887},{""attributes"":null,""end"":6253,""start"":5279},{""attributes"":null,""end"":6883,""start"":6255},{""attributes"":null,""end"":7469,""start"":6885},{""attributes"":null,""end"":7822,""start"":7471},{""attributes"":null,""end"":7829,""start"":7824},{""attributes"":nu...","[{""attributes"":null,""end"":8125,""start"":8108},{""attributes"":null,""end"":10740,""start"":10703}]"
3,266900080,,,2401.04353,Counting and metrology of distributed atomic clocks using metropolitan fiber,,https://arxiv.org/abs/2401.04353,,"\nIntroduction.\n\nIn the past twenty years, frequency dissemination has been studied extensively and deeply because of its increasingly demand in many different applications, and the current optical frequency transfer with fiber link can achieve the stability of E-21.With the development of quantum frequency standards such as hydrogen maser and fountain clock, the short-term stability of radio frequency has reached 5E-14@1s and the most advanced optical frequency clock such as lattice clock...","[{""attributes"":null,""end"":879,""start"":16},{""attributes"":null,""end"":1985,""start"":881},{""attributes"":null,""end"":2897,""start"":1987},{""attributes"":null,""end"":3963,""start"":2899},{""attributes"":null,""end"":4872,""start"":3965},{""attributes"":null,""end"":8736,""start"":4899},{""attributes"":null,""end"":10195,""start"":8802},{""attributes"":null,""end"":12353,""start"":10241},{""attributes"":null,""end"":13113,""start"":12355}]","[{""attributes"":null,""end"":14,""start"":1},{""attributes"":null,""end"":4897,""start"":4874},{""attributes"":null,""end"":8800,""start"":8738},{""attributes"":null,""end"":10239,""start"":10197}]"
4,267561748,,10.1051/e3sconf/202448605009,,Production of organomineral fertilizers based on local raw materials and nitrogen-fixing microorganisms,CCBY,https://doi.org/10.1051/e3sconf/202448605009,GOLD,\nIntroduction\n\nDue to the rapid increase in the number of people in the world. providing them with sufficient food products is one of the important problems. Adequate use of organomineral fertilizers (OMF) through nitrogen-fixing microorganisms is necessary to solve this problem. By creating an optimal nutrient environment for nitrogen-fixing microorganisms. it is important to research in the direction of achieving the process of maximum nitrogen fixation in the air and creating a flexibl...,"[{""attributes"":null,""end"":567,""start"":15},{""attributes"":null,""end"":1192,""start"":569},{""attributes"":null,""end"":1317,""start"":1194},{""attributes"":null,""end"":1681,""start"":1319},{""attributes"":null,""end"":1915,""start"":1683},{""attributes"":null,""end"":3053,""start"":1917},{""attributes"":null,""end"":3380,""start"":3055},{""attributes"":null,""end"":3655,""start"":3382},{""attributes"":null,""end"":4162,""start"":3657},{""attributes"":null,""end"":4752,""start"":4164},{""attributes"":null,""end"":5287,""start"":4777},{""attributes"":n...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{""attributes"":{""n"":""2""},""end"":4775,""start"":4754},{""attributes"":{""n"":""3""},""end"":8230,""start"":8208},{""attributes"":{""n"":""4""},""end"":13197,""start"":13187}]"
5,208033870,2988333020.0,10.1182/bloodadvances.2019000700,,Larotrectinib in TRK fusion-positive pediatric B-cell acute lymphoblastic leukemia.,CCBYNCND,https://doi.org/10.1182/bloodadvances.2019000700,GOLD,"\nIntroduction\n\nRearrangements involving neurotrophic receptor tyrosine kinase (NTRK) genes can generate fusion oncoproteins driving tumor development and survival. 1 NTRK gene fusions have been identified across a range of adult and pediatric solid malignancies. 2 B-cell acute lymphoblastic leukemia (ALL) can harbor an ETV6-NTRK3 gene fusion in ;1% of the so-called ""Philadelphia-like"" cases. 3 ETV6-NTRK3 fusion-positive B-cell ALL is characterized by rapid proliferation and infiltration o...","[{""attributes"":null,""end"":1462,""start"":15},{""attributes"":null,""end"":5737,""start"":1482},{""attributes"":null,""end"":6021,""start"":5748},{""attributes"":null,""end"":8020,""start"":6047},{""attributes"":null,""end"":8031,""start"":8024},{""attributes"":null,""end"":8037,""start"":8035}]","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":1480,""start"":1464},{""attributes"":null,""end"":5746,""start"":5739},{""attributes"":null,""end"":6045,""start"":6023}]"
6,259077538,,10.3389/fphar.2023.1195490,,Curcumin supplementation increases longevity and antioxidant capacity in Caenorhabditis elegans,CCBY,https://pmc.ncbi.nlm.nih.gov/articles/PMC10279890,GOLD,"\nIntroduction\n\nAging and age-related disorders have emerged as significant medical and social concerns. Extensive research has indicated that the aging process is influenced by a complex interplay of various factors, including genetic and epigenetic interactions (Campisi et al., 2019;López-Otín et al., 2023). Aging is characterized by a decrease in various physiological activities of the organism and is accompanied by changes in the internal and external environment of the organism. Senes...","[{""attributes"":null,""end"":1574,""start"":15},{""attributes"":null,""end"":3220,""start"":1576},{""attributes"":null,""end"":4963,""start"":3222},{""attributes"":null,""end"":5749,""start"":4965},{""attributes"":null,""end"":6525,""start"":5818},{""attributes"":null,""end"":7327,""start"":6549},{""attributes"":null,""end"":7518,""start"":7361},{""attributes"":null,""end"":7847,""start"":7520},{""attributes"":null,""end"":8138,""start"":7849},{""attributes"":null,""end"":8472,""start"":8160},{""attributes"":null,""end"":8996,""start"":8552},{""attributes""...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{""attributes"":{""n"":""2""},""end"":5772,""start"":5751},{""attributes"":{""n"":""2.1""},""end"":5816,""start"":5774},{""attributes"":{""n"":""2.2""},""end"":6547,""start"":6527},{""attributes"":{""n"":""2.3""},""end"":7359,""start"":7329},{""attributes"":{""n"":""2.4""},""end"":8158,""start"":8140},{""attributes"":{""n"":""2.5""},""end"":8550,""start"":8474},{""attributes"":{""n"":""2.6""},""end"":9027,""start"":8998},{""attributes"":{""n"":""2.6.1""},""end"":9718,""start"":9698},{""attributes"":{""n"":""2.7""},""end"":11665,""star..."
7,36121092,2580988987.0,10.1080/1369183X.2017.1320940,,Socioeconomic success of Asian immigrants in the United States,CCBY,https://doi.org/10.1080/1369183X.2017.1320940,HYBRID,"\nIntroduction\n\nIn recent years, there has been a significant increase in immigration from Asian countries to the United States. Between 2000 and 2015, Asian immigrant populations in the United States grew faster than any other migrant group (Zong and Batalova 2016). This increase has stimulated a growing number of researchers to focus on Asian immigrants although research on Latino migrant populations is still dominating the field of migration studies in the United States, especially when...","[{""attributes"":null,""end"":535,""start"":15},{""attributes"":null,""end"":1918,""start"":537},{""attributes"":null,""end"":2589,""start"":1920},{""attributes"":null,""end"":3020,""start"":2591},{""attributes"":null,""end"":4107,""start"":3041},{""attributes"":null,""end"":5742,""start"":4109},{""attributes"":null,""end"":9034,""start"":5744},{""attributes"":null,""end"":10355,""start"":9036},{""attributes"":null,""end"":10834,""start"":10398},{""attributes"":null,""end"":11729,""start"":10861},{""attributes"":null,""end"":12685,""start"":11743},{""attrib...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":3039,""start"":3022},{""attributes"":null,""end"":10396,""start"":10357},{""attributes"":null,""end"":10859,""start"":10836},{""attributes"":null,""end"":11741,""start"":11731},{""attributes"":null,""end"":12912,""start"":12891},{""attributes"":null,""end"":13190,""start"":13180},{""attributes"":null,""end"":14560,""start"":14526},{""attributes"":null,""end"":15183,""start"":15123},{""attributes"":null,""end"":16637,""start"":16614},{""attributes"":null,""end"":16659,""start"":16639..."
8,260400257,,10.3389/fendo.2023.1233685,,Duration of oestrogen exposure does not affect reproductive outcome in artificial cycles: a retrospective analysis of more than 7000 hormonal replacement therapy cycles for an embryo transfer,CCBY,https://pmc.ncbi.nlm.nih.gov/articles/PMC10430777,GOLD,"\nIntroduction\n\nArtificial endometrial preparation with hormonal replacement therapy (HRT) is frequently used for frozen embryo transfer (FET) and egg donation cycles (1). This protocol involves the administration of exogenous estrogen and progesterone trying to mimic the hormonal changes happening physiologically in a natural cycle. \n\nWhereas the number of days of progesterone administration until the embryo transfer (ET) is clearly defined according to the embryo development stage, the...","[{""attributes"":null,""end"":335,""start"":15},{""attributes"":null,""end"":1096,""start"":337},{""attributes"":null,""end"":1728,""start"":1098},{""attributes"":null,""end"":2408,""start"":1730},{""attributes"":null,""end"":2695,""start"":2410},{""attributes"":null,""end"":3114,""start"":2697},{""attributes"":null,""end"":3460,""start"":3116},{""attributes"":null,""end"":4055,""start"":3462},{""attributes"":null,""end"":4298,""start"":4100},{""attributes"":null,""end"":4858,""start"":4318},{""attributes"":null,""end"":5227,""start"":4871},{""attributes"":n...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{""attributes"":{""n"":""2""},""end"":4078,""start"":4057},{""attributes"":{""n"":""2.1""},""end"":4098,""start"":4080},{""attributes"":{""n"":""2.1.1""},""end"":4316,""start"":4300},{""attributes"":{""n"":""2.1.2""},""end"":4869,""start"":4860},{""attributes"":{""n"":""2.2""},""end"":6852,""start"":6808},{""attributes"":{""n"":""2.2.2""},""end"":8396,""start"":8382},{""attributes"":{""n"":""2.3""},""end"":9261,""start"":9241},{""attributes"":{""n"":""3""},""end"":10434,""start"":10427},{""attributes"":{""n"":""3.1""},""end"":10456,""..."
9,15956973,2469794930.0,10.1038/aps.2015.166,,Protrusion-localized STAT3 mRNA promotes metastasis of highly metastatic hepatocellular carcinoma cells in vitro,,https://pmc.ncbi.nlm.nih.gov/articles/PMC4954761,BRONZE,"\nIntroduction\n\nHepatocellular carcinoma (HCC) is the fourth most common malignant cancer in the world. Approximately 250 000 people worldwide die of HCC each year [1,2] . More than 60% of patients who suffer from HCC are not diagnosed at an early stage because of the unclear pathogenesis of the disease, a hallmark of HCC. A delayed diagnosis of HCC results in only a 5% survival rate after 5 years [3] . Most cancer-associated mortality is related to the metastasis of malignant cancer cells...","[{""attributes"":null,""end"":1868,""start"":15},{""attributes"":null,""end"":2771,""start"":1870},{""attributes"":null,""end"":3725,""start"":2773},{""attributes"":null,""end"":4910,""start"":3750},{""attributes"":null,""end"":5950,""start"":4912},{""attributes"":null,""end"":7272,""start"":6002},{""attributes"":null,""end"":8683,""start"":7274},{""attributes"":null,""end"":9806,""start"":8685},{""attributes"":null,""end"":10110,""start"":9830},{""attributes"":null,""end"":11872,""start"":10193},{""attributes"":null,""end"":13985,""start"":11874},{""attrib...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":3748,""start"":3727},{""attributes"":null,""end"":6000,""start"":5952},{""attributes"":null,""end"":9828,""start"":9808},{""attributes"":null,""end"":10119,""start"":10112},{""attributes"":null,""end"":10191,""start"":10121},{""attributes"":null,""end"":14687,""start"":14637},{""attributes"":null,""end"":16126,""start"":16045},{""attributes"":null,""end"":19414,""start"":19404}]"


In [17]:
# Call the project display helper with no arguments (presumably restores its
# default pandas display settings -- confirm in utils.pd_set_options).
utils.pd_set_options()

# Same flattening as a two-stage CTE: `raw_...` caps the scan at 5000 source
# rows, `base_...` extracts the scalar fields, and the final SELECT shows five.
wr.athena.read_sql_query(
    sql="""
WITH
raw_semanticscholar_s2orcv2 AS 
(
    SELECT * FROM "01_raw".semanticscholar_s2orc_v2 LIMIT 5000
),
base_semanticscholar_s2orcv2 AS 
(
    SELECT
        corpusid AS id_semanticscholar,
        JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.externalids.mag') AS id_mag,
        JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.externalids.doi') AS id_doi,
        JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.externalids.arxiv') AS id_arxiv,
        title,
        JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.license') AS license,
        JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.url') AS source_url,
        JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.status') AS openaccess_status,
        JSON_EXTRACT_SCALAR(CAST(body AS JSON), '$.text') AS content_text,
        JSON_EXTRACT_SCALAR(CAST(body AS JSON), '$.annotations.paragraph') AS annotations_paragraph,
        JSON_EXTRACT_SCALAR(CAST(body AS JSON), '$.annotations.section_header') AS annotations_section_header
        --'x' AS "x"
     FROM 
         raw_semanticscholar_s2orcv2
)
SELECT * FROM base_semanticscholar_s2orcv2 LIMIT 5
""",
    database='01_raw',
)

Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,title,license,source_url,openaccess_status,content_text,annotations_paragraph,annotations_section_header
0,26516356,1522375164.0,10.1016/S0065-3527(08)60087-1,,Pathogenesis of Virus-Induced Demyelination,unspecified-oa,https://pmc.ncbi.nlm.nih.gov/articles/PMC7131186,GREEN,"\nI. INTRODUCTION\n\nA . Demyelinating Diseases of Humans\n\nDemyelination is the loss of myelin, the lipid sheath surrounding the neuronal axon. Myelin loss can occur following direct damage of the myelin (primary demyelination) or secondary to neuronal damage and axonal loss (secondary demyelination). Myelin is produced by oligodendrocytes in the central nervous system (CNS) and Schwann cells in the peripheral nervous system (PNS). The biochemistry of these two forms is different.\n\nDemyelination is a component of several viral diseases of humans. The best known of these are subacute sclerosing panencephalitis (SSPE) of children, produced by measles virus, and progressive multifocal leukoencephalopathy (PML), produced in adults and children by J C papovavirus. Both are rare conditions, SSPE being a late onset complication of measles and PML occurring in association with immunosuppression. 
In addition to these two well-described examples of viral demyelination there are several o...","[{""attributes"":null,""end"":482,""start"":56},{""attributes"":null,""end"":1402,""start"":484},{""attributes"":null,""end"":2380,""start"":1404},{""attributes"":null,""end"":3169,""start"":2382},{""attributes"":null,""end"":3750,""start"":3171},{""attributes"":null,""end"":5696,""start"":3794},{""attributes"":null,""end"":6952,""start"":5698},{""attributes"":null,""end"":7956,""start"":6954},{""attributes"":null,""end"":11482,""start"":7958},{""attributes"":null,""end"":12764,""start"":11484},{""attributes"":null,""end"":14212,""start"":12766},{""attributes"":null,""end"":15022,""start"":14260},{""attributes"":null,""end"":15630,""start"":15024},{""attributes"":null,""end"":15913,""start"":15632},{""attributes"":null,""end"":17206,""start"":15915},{""attributes"":null,""end"":18164,""start"":17208},{""attributes"":null,""end"":19592,""start"":18166},{""attributes"":null,""end"":20406,""start"":19594},{""attributes"":null,""end"":21218,""start"":20408},{""attributes"":null,""end"":21932,""start"":21220},{""attributes"":null,""end"":22222,""start"":21974},{""attributes"":null,""end"":23482,""start"":22224},{""at...","[{""attributes"":null,""end"":16,""start"":1},{""attributes"":null,""end"":54,""start"":18},{""attributes"":null,""end"":3792,""start"":3752},{""attributes"":null,""end"":14258,""start"":14214},{""attributes"":null,""end"":21972,""start"":21934},{""attributes"":{""n"":""11.""},""end"":26720,""start"":26684},{""attributes"":null,""end"":26747,""start"":26722},{""attributes"":null,""end"":40086,""start"":40067},{""attributes"":null,""end"":53469,""start"":53445},{""attributes"":null,""end"":65266,""start"":65251},{""attributes"":null,""end"":74486,""start"":74461},{""attributes"":null,""end"":79297,""start"":79247},{""attributes"":{""n"":""111.""},""end"":83021,""start"":82993},{""attributes"":null,""end"":83061,""start"":83023},{""attributes"":null,""end"":84
638,""start"":84599},{""attributes"":null,""end"":86618,""start"":86481},{""attributes"":null,""end"":88423,""start"":88377},{""attributes"":null,""end"":90291,""start"":90253},{""attributes"":null,""end"":94586,""start"":94523},{""attributes"":null,""end"":95520,""start"":95499},{""attributes"":null,""end"":96494,""start"":96442},{""attributes"":null,""end"":9654..."
1,7628358,2271775406.0,10.4103/0301-4738.176038,,A case of giant nodular posterior scleritis mimicking choroidal malignancy,CCBYNCSA,https://pmc.ncbi.nlm.nih.gov/articles/PMC4784081,GOLD,"\nPosterior scleritis is often an under-recognized entity given its low incidence and variable clinical manifestations. It comprises 10% of all cases of scleritis and is associated with systemic diseases in up to one-third of the cases. 17% of the patients have no detectable physical signs on the first examination. [1] A retrospective review of patients attending the Moorfields Eye Hospital in United Kingdom between 1974 and 1996 showed that posterior scleritis was twice as common in women as in men. The mean age at onset was 49 years. [1] Periocular pain, headache, and visual loss were common presenting symptoms. [1,2] A high index of suspicion is necessary to detect this potentially sight-threatening disease early in its course so that effective therapy can be administered to limit visual loss. Despite growing experiences and diagnostic advances, nodular posterior scleritis continues to be a diagnostic challenge as it can often mimic choroidal melanoma clinically. 
In a large revi...","[{""attributes"":null,""end"":1325,""start"":1},{""attributes"":null,""end"":2073,""start"":1340},{""attributes"":null,""end"":2499,""start"":2075},{""attributes"":null,""end"":3314,""start"":2501},{""attributes"":null,""end"":4005,""start"":3316},{""attributes"":null,""end"":4512,""start"":4007},{""attributes"":null,""end"":5063,""start"":4526},{""attributes"":null,""end"":7475,""start"":5065},{""attributes"":null,""end"":7517,""start"":7512},{""attributes"":null,""end"":8094,""start"":7715},{""attributes"":null,""end"":8389,""start"":8096},{""attributes"":null,""end"":8677,""start"":8554},{""attributes"":null,""end"":9388,""start"":8679},{""attributes"":null,""end"":9393,""start"":9392},{""attributes"":null,""end"":9398,""start"":9397}]","[{""attributes"":null,""end"":1338,""start"":1327},{""attributes"":null,""end"":4524,""start"":4514},{""attributes"":null,""end"":7510,""start"":7477},{""attributes"":null,""end"":7621,""start"":7519},{""attributes"":null,""end"":7713,""start"":7623},{""attributes"":null,""end"":8552,""start"":8391}]"
2,252820925,,10.3389/fneur.2022.967077,,Prevalence and risk factors of MRI-defined brain infarcts among Chinese adults,CCBY,https://pmc.ncbi.nlm.nih.gov/articles/PMC9597681,GOLD,"\nIntroduction\n\nChina currently carries the world's largest burden of stroke, which has become a major public health challenge (1,2). However, overt stroke, easily recognized clinically, represents only the tip of the iceberg. In contrast, silent brain infarcts (SBI) are often ignored and represent the larger below the surface of the water (3). With the development of brainimaging techniques, brain abnormalities are commonly found using brain magnetic resonance imaging (MRI) (4,5). The prevalence of MRI-defined BI exceeds, by far, the prevalence of symptomatic stroke (6). Although the majority of MRI-defined BI were covert without clinical stroke symptoms (7), they are highly valuable in predicting subsequent risk of symptomatic stroke, dementia, and mortality (8,9). However, few studies regarding the epidemiology of MRI-defined BI have been conducted in China, and previous studies were limited by small sample sizes or certain geographic regions only (7,10). 
Meanwhile, the associ...","[{""attributes"":null,""end"":1159,""start"":15},{""attributes"":null,""end"":1611,""start"":1161},{""attributes"":null,""end"":2495,""start"":1653},{""attributes"":null,""end"":3434,""start"":2497},{""attributes"":null,""end"":3665,""start"":3436},{""attributes"":null,""end"":5275,""start"":3709},{""attributes"":null,""end"":5548,""start"":5303},{""attributes"":null,""end"":6500,""start"":5550},{""attributes"":null,""end"":7626,""start"":6524},{""attributes"":null,""end"":7828,""start"":7628},{""attributes"":null,""end"":7964,""start"":7849},{""attributes"":null,""end"":8703,""start"":8014},{""attributes"":null,""end"":9012,""start"":8705},{""attributes"":null,""end"":9720,""start"":9056},{""attributes"":null,""end"":11391,""start"":9722},{""attributes"":null,""end"":11727,""start"":11562},{""attributes"":null,""end"":11741,""start"":11729},{""attributes"":null,""end"":12872,""start"":11960},{""attributes"":null,""end"":13808,""start"":12874},{""attributes"":null,""end"":13930,""start"":13810},{""attributes"":null,""end"":15020,""start"":13932},{""attributes"":null,""end"":15242,""start"":15022},{""attributes"":...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":1620,""start"":1613},{""attributes"":null,""end"":1651,""start"":1622},{""attributes"":null,""end"":3707,""start"":3667},{""attributes"":null,""end"":5301,""start"":5277},{""attributes"":null,""end"":6522,""start"":6502},{""attributes"":null,""end"":7847,""start"":7830},{""attributes"":null,""end"":7973,""start"":7966},{""attributes"":null,""end"":8012,""start"":7975},{""attributes"":null,""end"":9054,""start"":9014},{""attributes"":null,""end"":11408,""start"":11393},{""attributes"":null,""end"":11528,""start"":11410},{""attributes"":null,""end"":11560,""start"":11530},{""attributes"":null,""end"":11863,""start"":11743},{""attributes"":null,""end"":11880,""start"":11865},{""attributes"":null,""end"":11946,""start"":11882},{""attributes"":null,""end"":
11958,""start"":11948},{""attributes"":null,""end"":20989,""start"":20962},{""attributes"":null,""end"":21126,""start"":21110},{""attributes"":null,""end"":21491,""start"":21471},{""attributes"":null,""end"":21798,""start"":21791}]"
3,21632151,2589746172.0,10.1016/j.hrcr.2017.01.002,,Spatiotemporal characterization of the transition from sinus rhythm to ventricular fibrillation during an acute ischemic event in the intact human heart by whole-heart sock-mapping,CCBYNCND,https://pmc.ncbi.nlm.nih.gov/articles/PMC5509912,GOLD,"\nIntroduction\n\nThe accurate characterization of the spatiotemporal pattern of electrical activation and repolarization during ischemia is relevant for diagnosis and sudden cardiac death prediction. Although several animal studies have comprehensively described changes in acute ischemic models in the dog, 1,2 cat, 3 and pig, 4 data from whole-heart mapping of acute ischemia in the in vivo human heart are scant and are needed to complement animal models.\n\nCase report\n\nA 55-year-old man with chronic total occlusion of the right coronary artery with retrograde filling, 50% left anterior descending artery disease, mild left anterior descending diagonal re-stenosis of stents (inserted 10 years previously), and occlusion of distal circumflex artery was referred for coronary artery bypass grafting at the Heart Hospital, London, UK.\n\nEchocardiographic analysis reported normal left ventricle (LV) and right ventricle size and systolic function and mild mitral and tricuspid 
regurgitat...","[{""attributes"":null,""end"":456,""start"":15},{""attributes"":null,""end"":835,""start"":471},{""attributes"":null,""end"":991,""start"":837},{""attributes"":null,""end"":1290,""start"":993},{""attributes"":null,""end"":1583,""start"":1292},{""attributes"":null,""end"":2234,""start"":1585},{""attributes"":null,""end"":2864,""start"":2236},{""attributes"":null,""end"":3079,""start"":2866},{""attributes"":null,""end"":6411,""start"":3081},{""attributes"":null,""end"":7505,""start"":6413},{""attributes"":null,""end"":7731,""start"":7528},{""attributes"":null,""end"":7989,""start"":7733},{""attributes"":null,""end"":8183,""start"":7991},{""attributes"":null,""end"":8453,""start"":8185},{""attributes"":null,""end"":10145,""start"":8467},{""attributes"":null,""end"":10515,""start"":10147},{""attributes"":null,""end"":10942,""start"":10517},{""attributes"":null,""end"":11428,""start"":10944},{""attributes"":null,""end"":11949,""start"":11442}]","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":469,""start"":458},{""attributes"":null,""end"":7526,""start"":7507},{""attributes"":null,""end"":8465,""start"":8455},{""attributes"":null,""end"":11440,""start"":11430}]"
4,247428073,,10.1007/s11192-022-04320-x,,"Choice and allocation characteristics of faculty time in Korea: effects of tenure, research performance, and external shock",,https://pmc.ncbi.nlm.nih.gov/articles/PMC8916952,BRONZE,"\nIntroduction\n\nResearch output is of foremost importance to academics because it determines their job stability in most cases (e.g., Harter et al., 2011;Kasten, 1984) in addition to their social and economic value (e.g., Fairweather, 2002). Having been considered as a significant input factor to the research output, research time has also been closely related to job satisfaction of academics (Barham et al., 2014). \n\nHowever, an academic with a balanced workload (e.g., research, teaching, and so forth) cannot commit all the work time 1 to research. An academic, considering both institutional requirements and individual environments, allocates his or her work time to perform administrative work as well as research, teaching students, and sometimes participating in non-academic activities (Barham et al., 2014;Bentley & Kyvik, 2013;Harter et al., 2011). \n\nPrevious studies reported that such work time allocation of an academic is affected by various factors. 
They include incentiv...","[{""attributes"":null,""end"":418,""start"":15},{""attributes"":null,""end"":862,""start"":420},{""attributes"":null,""end"":1529,""start"":864},{""attributes"":null,""end"":3818,""start"":1531},{""attributes"":null,""end"":4144,""start"":3820},{""attributes"":null,""end"":4773,""start"":4146},{""attributes"":null,""end"":5382,""start"":4796},{""attributes"":null,""end"":6720,""start"":5384},{""attributes"":null,""end"":7754,""start"":6722},{""attributes"":null,""end"":9238,""start"":7756},{""attributes"":null,""end"":10323,""start"":9240},{""attributes"":null,""end"":11158,""start"":10325},{""attributes"":null,""end"":12272,""start"":11160},{""attributes"":null,""end"":13452,""start"":12288},{""attributes"":null,""end"":15049,""start"":13462},{""attributes"":null,""end"":15149,""start"":15051},{""attributes"":null,""end"":16251,""start"":15179},{""attributes"":null,""end"":16380,""start"":16253},{""attributes"":null,""end"":16511,""start"":16382},{""attributes"":null,""end"":17135,""start"":16513},{""attributes"":null,""end"":17410,""start"":17137},{""attributes"":null,""end"":17861,""start"":17412},{""attribut...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":4794,""start"":4775},{""attributes"":null,""end"":12286,""start"":12274},{""attributes"":null,""end"":13460,""start"":13454},{""attributes"":null,""end"":15177,""start"":15151},{""attributes"":null,""end"":18530,""start"":18512},{""attributes"":null,""end"":20823,""start"":20790},{""attributes"":null,""end"":26305,""start"":26296},{""attributes"":null,""end"":28247,""start"":28232},{""attributes"":null,""end"":29837,""start"":29804},{""attributes"":null,""end"":29855,""start"":29839},{""attributes"":null,""end"":32509,""start"":32470},{""attributes"":null,""end"":34121,""start"":34075},{""attributes"":null,""end"":34984,""start"":34946},{""attributes"":null,""end"":36742,""start"":36682},{""attributes"":null,""end"":39184,""start"":39164},{""attributes"
":null,""end"":40641,""start"":40623}]"


In [10]:
# Count works per (subfield, field, domain) of their primary topic.
# NOTE(review): utils.pd_set_options() is called without arguments here (an earlier
# cell used cols=500); presumably this restores the helper's default display
# settings -- confirm in utils.py.
utils.pd_set_options()

# The previous version of this query declared an empty, unused CTE
# (`base_openalex_works_reduced AS ()`) with no comma before `grouped`, which is
# not valid SQL. `grouped` reads straight from openalex_works_reduced, so the
# placeholder has been dropped.
wr.athena.read_sql_query("""
WITH
grouped AS (
    SELECT
        --primary_topic_short_id,
        --primary_topic_display_name,
        primary_topic_subfield_short_id,
        primary_topic_subfield_long_id,
        primary_topic_subfield_display_name,
        primary_topic_field_short_id,
        primary_topic_field_long_id,
        primary_topic_field_display_name,
        primary_topic_domain_short_id,
        primary_topic_domain_long_id,
        primary_topic_domain_display_name,
        COUNT(*) AS c
     FROM
        openalex_works_reduced
     GROUP BY
        primary_topic_subfield_short_id,
        primary_topic_subfield_long_id,
        primary_topic_subfield_display_name,
        primary_topic_field_short_id,
        primary_topic_field_long_id,
        primary_topic_field_display_name,
        primary_topic_domain_short_id,
        primary_topic_domain_long_id,
        primary_topic_domain_display_name
)
SELECT 
    *
FROM
    grouped
ORDER BY
    primary_topic_domain_long_id,
    primary_topic_domain_short_id,
    primary_topic_field_long_id,
    primary_topic_field_short_id,
    primary_topic_subfield_long_id,
    primary_topic_subfield_short_id
 """, database='01_raw')

Unnamed: 0,primary_topic_subfield_short_id,primary_topic_subfield_long_id,primary_topic_subfield_display_name,primary_topic_field_short_id,primary_topic_field_long_id,primary_topic_field_display_name,primary_topic_domain_short_id,primary_topic_domain_long_id,primary_topic_domain_display_name,c
0,,1102,Agronomy and Crop Science,,11,Agricultural and Biological Sciences,,1,Life Sciences,1
1,,1103,Animal Science and Zoology,,11,Agricultural and Biological Sciences,,1,Life Sciences,2
2,,1105,"Ecology, Evolution, Behavior and Systematics",,11,Agricultural and Biological Sciences,,1,Life Sciences,2
3,,1109,Insect Science,,11,Agricultural and Biological Sciences,,1,Life Sciences,1
4,,1304,Biophysics,,13,"Biochemistry, Genetics and Molecular Biology",,1,Life Sciences,1
5,,1310,Endocrinology,,13,"Biochemistry, Genetics and Molecular Biology",,1,Life Sciences,1
6,,1311,Genetics,,13,"Biochemistry, Genetics and Molecular Biology",,1,Life Sciences,2
7,,1312,Molecular Biology,,13,"Biochemistry, Genetics and Molecular Biology",,1,Life Sciences,3
8,,2403,Immunology,,24,Immunology and Microbiology,,1,Life Sciences,2
9,,1207,History and Philosophy of Science,,12,Arts and Humanities,,2,Social Sciences,1
