In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Make the project's local modules importable. They are looked up in
# ../01_modules and ./01_modules, relative to this script/notebook; whichever
# of the two locations exists is added to the system path.
def add_sys_path_once(path):
    """Append ``path`` to ``sys.path`` if it exists and is not already listed.

    The membership check keeps the cell idempotent: re-running it in a live
    kernel does not pile up duplicate entries on ``sys.path``.

    Args:
        path: Filesystem location (str or path-like) to make importable.

    Returns:
        True if ``path`` was appended, False if it does not exist or was
        already present on ``sys.path``.
    """
    path = os.fspath(path)
    if not os.path.exists(path) or path in sys.path:
        return False
    sys.path.append(path)
    return True


# __file__ is only present when this code runs from a file; when it is
# missing (typically inside a notebook kernel) fall back to the current
# working directory.
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
add_sys_path_once(modules_path_in_dev)
add_sys_path_once(modules_path_in_prod)


# Python only executes a local module the first time it is imported after
# kernel start. Re-running a cell that imports the module again does not
# pick up anything, even if the module's source has changed since then.
# As a workaround, we import the module and then call importlib.reload on
# it, so that re-running this cell re-executes the module's current code.
# Note: on a fresh kernel this executes each module twice (once for the
# import, once for the reload).
# The result of reload() is assigned to `_` so that the module object is not
# echoed as the cell's output.
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [3]:
# Peek at five rows of the raw S2ORC v2 table to see which columns it has.
# Naming note: the raw table is called ..._s2orc_v2, while the models built
# from it drop that underscore (..._s2orcv2).
raw_sample_sql = """
SELECT
    *
 FROM 
     "01_raw".semanticscholar_s2orc_v2 -- v2 has an _ prefix, it is removed from dowstream models
 LIMIT 5
 """

# Display settings come from the project helper in utils (defined outside
# this notebook).
utils.pd_set_options(cols=500)
wr.athena.read_sql_query(sql=raw_sample_sql, database='01_raw')

Unnamed: 0,corpusid,openaccessinfo,title,authors,body,bibliography
0,39964499,"{'disclaimer': 'This content is derived from https://arxiv.org/abs/1208.0130. ', 'externalids': {'medline': '23214774v1', 'mag': '2081684912', 'acl': None, 'doi': '10.1103/PhysRevE.86.051302', 'medrxiv': None, 'pubmedcentral': None, 'arxiv': '1208.0130'}, 'license': None, 'url': 'https://arxiv.org/abs/1208.0130', 'status': 'GREEN'}",Origin of rebounds with a restitution coefficient larger than unity in nanocluster collisions.,"[H. Kuninaka, H. Hayakawa]","{'text': ' I. INTRODUCTION Nanoclusters are technologically important for the construction of nanodevices. Because the size of nanoclusters is mesoscopic, thermodynamic properties of such materials are still not well understood [1], though the methods to make nanoclusters such as adiabatic expansion through a nozzle and a laser ablation technique are well established. [2] Dynamics of nanoclusters are extensively investigated from both scientific and technological interest. There are many nu...","{'text': 'AcknowledgmentsWe would like to thank N. V. Brilliantov, T. Kawasaki, S. Takesue, R. Murakami, and K. Saitoh for their valuable comments.Also, HK would like to thank Y. Wang and W. Lechner who gave him many advises for calculation of order parameters and how to use their calculation codes.Parts of numerical computation in this work were carried out at Yukawa Institute Computer Facility.This work was supported by the Grant-in-Aid for the Global COE Program ""The Next Generation of Ph..."
1,234595201,"{'disclaimer': 'This content is derived from https://pmc.ncbi.nlm.nih.gov/articles/PMC8100426. Its open-access license is CCBYNCSA.', 'externalids': {'medline': '33995511v1', 'mag': None, 'acl': None, 'doi': '10.11909/j.issn.1671-5411.2021.04.010', 'medrxiv': None, 'pubmedcentral': '8100426', 'arxiv': None}, 'license': 'CCBYNCSA', 'url': 'https://pmc.ncbi.nlm.nih.gov/articles/PMC8100426', 'status': None}",Prevention of self-harm through early detection of depression among the elderly with permanent pacemaker: a case report,"[Yu-Wei Chang, Ju-Yi Chen]","{'text': ' gnosis of sick sinus syndrome. Multiple somatic complaints, especially chronic back pain, had impacted the patient's sleep quality and daily life. The patient also had a diagnosis of general anxiety disorder, and she had been regularly visiting a psychiatric clinic for insomnia over the two years preceding this event. The patient occasionally expressed suicidal ideation but her family did not pay much attention to her suicidal verbalizations. At about 2:00 a.m., during the night...","{'text': 'ACKNOWLEDGMENTSThis study was supported by the Ministry of Science and Technology of Taiwan, China (MOST 108-2218-E-006-019 & MOST 109-2218-E-006-024).All authors had no conflicts of interest to disclose. Depression in older adults. A Fiske, J L Wetherell, M Gatz, Annu Rev Clin Psychol. 52009 . F M Kusumoto, M H Schoenfeld, C Barrett, 2018 ACC/AHA/HRS guideline on the evaluation and management of patients with bradycardia and cardiac conduction delay: a report of the American Col..."
2,14415362,"{'disclaimer': 'This content is derived from https://arxiv.org/abs/0705.0584. ', 'externalids': {'medline': None, 'mag': '2031777933', 'acl': None, 'doi': '10.1007/S00605-008-0535-3', 'medrxiv': None, 'pubmedcentral': None, 'arxiv': '0705.0584'}, 'license': None, 'url': 'https://arxiv.org/abs/0705.0584', 'status': 'CLOSED'}",Multidimensional continued fractions and a Minkowski function,[Giovanni Panti],"{'text': ' Preliminaries The nth order Farey set F n in the real unit interval [0, 1] is defined by recursion: one starts with F 0 = {0/1, 1/1} and obtains F n by adding to F n−1 all the Farey sums v 1 ⊕ v 2 = (a 1 + a 2 )/(b 1 + b 2 ) of two consecutive elements v i = a i /b i of F n−1 . The union of all the F n 's is the set of all rational numbers in [0, 1]. Analogously, by starting with B 0 = F 0 and replacing the Farey sum with the barycentric sum v 1 v 2 = (v 1 + v 2 )/2, we obtain an...","{'text': 'as an index of the singularity of Φ at p.As we already observed, λ Φ[∆ ā t ] = 2 −t .By the Shannon-McMillan-Breiman Theorem[3, §13]we have, for µ-all p (and hence for λ-all p, since µ and λ have the same nullsets), thatwhere h µ is the metrical entropy of M w.r.t.µ.Without loss of generality, we can assume that p is in the topological interior of ∆.For such a p, there exist t 0 and a constantTaking logarithms in ( * * ) we haveFor n = 2 we have h µ ∼ 0.54807 . . .and, as shown in ..."
3,18615301,"{'disclaimer': 'This content is derived from https://arxiv.org/abs/chao-dyn/9808003. ', 'externalids': {'medline': None, 'mag': '3102114047', 'acl': None, 'doi': '10.1021/jp9821836', 'medrxiv': None, 'pubmedcentral': None, 'arxiv': 'chao-dyn/9808003'}, 'license': None, 'url': 'https://arxiv.org/abs/chao-dyn/9808003', 'status': 'GREEN'}",SPECTRAL AUTOCORRELATION FUNCTION IN WEAKLY OPEN CHAOTIC SYSTEMS : INDIRECT PHOTODISSOCIATION OF MOLECULES,"[Y. Alhassid, Y. Fyodorov]","{'text': ' Quantum systems that are classically chaotic are believed to exhibit statistical fluctuations in their spectra and wavefunctions that are universal. 1 These universal properties are well-reproduced by the assumption that the Hamiltonians belong to an ensemble of Hamiltonians that are consistent with the underlying symmetries, but are otherwise random. Such random Hamiltonians are described by random matrix theory (RMT), 2,3 and lead to level repulsion, long range correlations in t...","{'text': ' . * On leave from Petersburg Nuclear Physics Institute. 188350 O Bohigas, Chaos and Quantum Physics, Les-Houches Session LII. M J Giannoni, North Holland, Amsterdam199191 . T A Brody, Rev. Mod. Phys. 533851981 C E Porter, Statistical Theory of Spectra: Fluctuations. New YorkAcademic Press1965 M L Mehta, Random Matrices. New YorkAcademic Press19912nd ed. . Th, H Zimmermann, L S Koppel, G Cederbaum, W Persch, Demtroder, Phys. Rev. Lett. 6131988 . D M Leitner, H Koppel, L S Ced..."
4,255895487,"{'disclaimer': 'This content is derived from https://doi.org/10.3390/en16020804. Its open-access status is GOLD and license is CCBY.', 'externalids': {'medline': None, 'mag': None, 'acl': None, 'doi': '10.3390/en16020804', 'medrxiv': None, 'pubmedcentral': None, 'arxiv': None}, 'license': 'CCBY', 'url': 'https://doi.org/10.3390/en16020804', 'status': 'GOLD'}","Along-Strike Reservoir Development of Steep-Slope Depositional Systems: Case Study from Liushagang Formation in the Weixinan Sag, Beibuwan Basin, South China Sea","[Shengfa Liu, Hongtao Zhu, Qianghu Liu, Ziqiang Zhou, Jiahao Chen]","{'text': ' Introduction Since the concept was first proposed in the 1960s, the fan delta depositional system has received increasing attention. With the development of research, studying the fan delta has gradually deepened from the initial study of sediment characteristics and outcrops to the sedimentary model and fan delta reservoirs [1][2][3][4][5][6]. With the gradual deepening of studies on the fan delta depositional system, we found that the fan delta depositional system is widely dev...",{'text': 'Acknowledgments:We thank the CNOOC (Hainan) for the release of all the data.We thank the two anonymous for reviewing this manuscript and the editorial department for editorial handling and helpful comments.Data Availability Statement:The data that support the findings of this study are available on request from the corresponding authors.The data are not publicly available due to confidentiality restrictions.Funding: This work was supported by Major Outsourcing Projects of China Nat...


In [13]:
# Flatten the nested `openaccessinfo` and `body` columns of the raw table into
# plain scalar columns (ids, license, url, status, text, annotations) and
# preview ten rows of the result.
extract_preview_sql = """
SELECT
    corpusid AS id_semanticscholar,
    JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.externalids.mag') AS id_mag,
    JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.externalids.doi') AS id_doi,
    JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.externalids.arxiv') AS id_arxiv,
    title,
    JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.license') AS license,
    JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.url') AS source_url,
    JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.status') AS openaccess_status,
    JSON_EXTRACT_SCALAR(CAST(body AS JSON), '$.text') AS content_text,
    JSON_EXTRACT_SCALAR(CAST(body AS JSON), '$.annotations.paragraph') AS annotations_paragraph,
    JSON_EXTRACT_SCALAR(CAST(body AS JSON), '$.annotations.section_header') AS annotations_section_header
    --'x' AS "x"
 FROM 
     "01_raw".semanticscholar_s2orc_v2
 WHERE
     TRUE
 LIMIT 10
 """

wr.athena.read_sql_query(sql=extract_preview_sql, database='01_raw')

Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,title,license,source_url,openaccess_status,content_text,annotations_paragraph,annotations_section_header
0,234869482,3155033926.0,10.5772/INTECHOPEN.97307,,Thoracoabdominal Compartment Syndrome,CCBY,https://doi.org/10.5772/INTECHOPEN.97307,HYBRID,"\nIntroduction\n\nThe diaphragm is formed from a number of composite origins in the embryo. The most important is the ""septum transversum"", which is a thick mass of cranial mesenchyme that gives rise to parts of the thoracic diaphragm. Without dwelling into more details, the septum transversum merges with mesoderm surrounding the esophagus, the growing pleura and peritoneum ('pleuroperitoneal folds') and the growing muscles of the abdominal wall. The septum transversum gives rise to the cent...","[{""attributes"":null,""end"":612,""start"":15},{""attributes"":null,""end"":730,""start"":614},{""attributes"":null,""end"":999,""start"":732},{""attributes"":null,""end"":1550,""start"":1001},{""attributes"":null,""end"":1777,""start"":1552},{""attributes"":null,""end"":2191,""start"":1779},{""attributes"":null,""end"":2552,""start"":2193},{""attributes"":null,""end"":2851,""start"":2554},{""attributes"":null,""end"":3350,""start"":2880},{""attributes"":null,""end"":3706,""start"":3352},{""attributes"":null,""end"":3930,""start"":3708},{""attributes"":null...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":2878,""start"":2853},{""attributes"":{""n"":""3.""},""end"":5257,""start"":5201},{""attributes"":{""n"":""3.1""},""end"":5279,""start"":5259},{""attributes"":{""n"":""3.2""},""end"":6340,""start"":6286},{""attributes"":{""n"":""3.3""},""end"":9066,""start"":9029},{""attributes"":{""n"":""4.""},""end"":10473,""start"":10463}]"
1,270297560,,10.3390/dj12060170,,Periodontal Regeneration of Vital Poor Prognosis Teeth with Attachment Loss Involving the Root Apex: Two Cases with up to 5 Years Follow-Up,CCBY,https://pmc.ncbi.nlm.nih.gov/articles/PMC11202695,GOLD,"\nIntroduction\n\nPeriodontal regeneration is the ultimate goal of periodontal therapy and is defined as the ""restoration of lost or diminished periodontal tissues including cementum, periodontal ligament, and alveolar bone"" [1].Guided tissue regeneration (GTR), on the other hand, refers to a surgical procedure seeking to obtain the objectives of periodontal regeneration through utilising barrier devices or membranes, to exclude epithelial cells and provide space maintenance [1].The clinical...","[{""attributes"":null,""end"":1005,""start"":15},{""attributes"":null,""end"":3444,""start"":1007},{""attributes"":null,""end"":4719,""start"":3469},{""attributes"":null,""end"":5712,""start"":4721},{""attributes"":null,""end"":6359,""start"":5722},{""attributes"":null,""end"":9027,""start"":6369},{""attributes"":null,""end"":9648,""start"":9037},{""attributes"":null,""end"":10673,""start"":9650},{""attributes"":null,""end"":12286,""start"":10683},{""attributes"":null,""end"":12578,""start"":12297},{""attributes"":null,""end"":13477,""start"":12589},{""attr...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},{""attributes"":{""n"":""2.""},""end"":3467,""start"":3446},{""attributes"":{""n"":""2.1.""},""end"":5720,""start"":5714},{""attributes"":{""n"":""2.1.""},""end"":6367,""start"":6361},{""attributes"":{""n"":""2.2.""},""end"":9035,""start"":9029},{""attributes"":{""n"":""2.2.""},""end"":10681,""start"":10675},{""attributes"":{""n"":""3.""},""end"":12295,""start"":12288},{""attributes"":{""n"":""3.""},""end"":12587,""start"":12580},{""attributes"":{""n"":""4.""},""end"":13489,""start"":13479},{""attributes"":{""n"":""4.""},""end"":142..."
2,251953573,,10.1103/PhysRevD.108.094029,2208.14827,Entanglement renormalization of the class of continuous matrix product states,CCBY,https://arxiv.org/abs/2208.14827,HYBRID,"\nTensor Network states are the entanglement-based ansatz that has arisen in recent years based on the renormalization group (RG) ideas and later on developed using tools and concepts from quantum information theory.\n\nThe main examples include matrix product states (MPS) [1], projected entangled-pair states (PEPS) [2], and multiscale entanglement renormalization ansatz (MERA) [3].By construction, they obey the entropy/area law [4][5][6][7] and are able to encode both global and local symme...","[{""attributes"":null,""end"":215,""start"":1},{""attributes"":null,""end"":1158,""start"":217},{""attributes"":null,""end"":2915,""start"":1160},{""attributes"":null,""end"":3320,""start"":2917},{""attributes"":null,""end"":4885,""start"":3322},{""attributes"":null,""end"":5277,""start"":4887},{""attributes"":null,""end"":6253,""start"":5279},{""attributes"":null,""end"":6883,""start"":6255},{""attributes"":null,""end"":7469,""start"":6885},{""attributes"":null,""end"":7822,""start"":7471},{""attributes"":null,""end"":7829,""start"":7824},{""attributes"":nu...","[{""attributes"":null,""end"":8125,""start"":8108},{""attributes"":null,""end"":10740,""start"":10703}]"
3,266900080,,,2401.04353,Counting and metrology of distributed atomic clocks using metropolitan fiber,,https://arxiv.org/abs/2401.04353,,"\nIntroduction.\n\nIn the past twenty years, frequency dissemination has been studied extensively and deeply because of its increasingly demand in many different applications, and the current optical frequency transfer with fiber link can achieve the stability of E-21.With the development of quantum frequency standards such as hydrogen maser and fountain clock, the short-term stability of radio frequency has reached 5E-14@1s and the most advanced optical frequency clock such as lattice clock...","[{""attributes"":null,""end"":879,""start"":16},{""attributes"":null,""end"":1985,""start"":881},{""attributes"":null,""end"":2897,""start"":1987},{""attributes"":null,""end"":3963,""start"":2899},{""attributes"":null,""end"":4872,""start"":3965},{""attributes"":null,""end"":8736,""start"":4899},{""attributes"":null,""end"":10195,""start"":8802},{""attributes"":null,""end"":12353,""start"":10241},{""attributes"":null,""end"":13113,""start"":12355}]","[{""attributes"":null,""end"":14,""start"":1},{""attributes"":null,""end"":4897,""start"":4874},{""attributes"":null,""end"":8800,""start"":8738},{""attributes"":null,""end"":10239,""start"":10197}]"
4,267561748,,10.1051/e3sconf/202448605009,,Production of organomineral fertilizers based on local raw materials and nitrogen-fixing microorganisms,CCBY,https://doi.org/10.1051/e3sconf/202448605009,GOLD,\nIntroduction\n\nDue to the rapid increase in the number of people in the world. providing them with sufficient food products is one of the important problems. Adequate use of organomineral fertilizers (OMF) through nitrogen-fixing microorganisms is necessary to solve this problem. By creating an optimal nutrient environment for nitrogen-fixing microorganisms. it is important to research in the direction of achieving the process of maximum nitrogen fixation in the air and creating a flexibl...,"[{""attributes"":null,""end"":567,""start"":15},{""attributes"":null,""end"":1192,""start"":569},{""attributes"":null,""end"":1317,""start"":1194},{""attributes"":null,""end"":1681,""start"":1319},{""attributes"":null,""end"":1915,""start"":1683},{""attributes"":null,""end"":3053,""start"":1917},{""attributes"":null,""end"":3380,""start"":3055},{""attributes"":null,""end"":3655,""start"":3382},{""attributes"":null,""end"":4162,""start"":3657},{""attributes"":null,""end"":4752,""start"":4164},{""attributes"":null,""end"":5287,""start"":4777},{""attributes"":n...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{""attributes"":{""n"":""2""},""end"":4775,""start"":4754},{""attributes"":{""n"":""3""},""end"":8230,""start"":8208},{""attributes"":{""n"":""4""},""end"":13197,""start"":13187}]"
5,208033870,2988333020.0,10.1182/bloodadvances.2019000700,,Larotrectinib in TRK fusion-positive pediatric B-cell acute lymphoblastic leukemia.,CCBYNCND,https://doi.org/10.1182/bloodadvances.2019000700,GOLD,"\nIntroduction\n\nRearrangements involving neurotrophic receptor tyrosine kinase (NTRK) genes can generate fusion oncoproteins driving tumor development and survival. 1 NTRK gene fusions have been identified across a range of adult and pediatric solid malignancies. 2 B-cell acute lymphoblastic leukemia (ALL) can harbor an ETV6-NTRK3 gene fusion in ;1% of the so-called ""Philadelphia-like"" cases. 3 ETV6-NTRK3 fusion-positive B-cell ALL is characterized by rapid proliferation and infiltration o...","[{""attributes"":null,""end"":1462,""start"":15},{""attributes"":null,""end"":5737,""start"":1482},{""attributes"":null,""end"":6021,""start"":5748},{""attributes"":null,""end"":8020,""start"":6047},{""attributes"":null,""end"":8031,""start"":8024},{""attributes"":null,""end"":8037,""start"":8035}]","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":1480,""start"":1464},{""attributes"":null,""end"":5746,""start"":5739},{""attributes"":null,""end"":6045,""start"":6023}]"
6,259077538,,10.3389/fphar.2023.1195490,,Curcumin supplementation increases longevity and antioxidant capacity in Caenorhabditis elegans,CCBY,https://pmc.ncbi.nlm.nih.gov/articles/PMC10279890,GOLD,"\nIntroduction\n\nAging and age-related disorders have emerged as significant medical and social concerns. Extensive research has indicated that the aging process is influenced by a complex interplay of various factors, including genetic and epigenetic interactions (Campisi et al., 2019;López-Otín et al., 2023). Aging is characterized by a decrease in various physiological activities of the organism and is accompanied by changes in the internal and external environment of the organism. Senes...","[{""attributes"":null,""end"":1574,""start"":15},{""attributes"":null,""end"":3220,""start"":1576},{""attributes"":null,""end"":4963,""start"":3222},{""attributes"":null,""end"":5749,""start"":4965},{""attributes"":null,""end"":6525,""start"":5818},{""attributes"":null,""end"":7327,""start"":6549},{""attributes"":null,""end"":7518,""start"":7361},{""attributes"":null,""end"":7847,""start"":7520},{""attributes"":null,""end"":8138,""start"":7849},{""attributes"":null,""end"":8472,""start"":8160},{""attributes"":null,""end"":8996,""start"":8552},{""attributes""...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{""attributes"":{""n"":""2""},""end"":5772,""start"":5751},{""attributes"":{""n"":""2.1""},""end"":5816,""start"":5774},{""attributes"":{""n"":""2.2""},""end"":6547,""start"":6527},{""attributes"":{""n"":""2.3""},""end"":7359,""start"":7329},{""attributes"":{""n"":""2.4""},""end"":8158,""start"":8140},{""attributes"":{""n"":""2.5""},""end"":8550,""start"":8474},{""attributes"":{""n"":""2.6""},""end"":9027,""start"":8998},{""attributes"":{""n"":""2.6.1""},""end"":9718,""start"":9698},{""attributes"":{""n"":""2.7""},""end"":11665,""star..."
7,36121092,2580988987.0,10.1080/1369183X.2017.1320940,,Socioeconomic success of Asian immigrants in the United States,CCBY,https://doi.org/10.1080/1369183X.2017.1320940,HYBRID,"\nIntroduction\n\nIn recent years, there has been a significant increase in immigration from Asian countries to the United States. Between 2000 and 2015, Asian immigrant populations in the United States grew faster than any other migrant group (Zong and Batalova 2016). This increase has stimulated a growing number of researchers to focus on Asian immigrants although research on Latino migrant populations is still dominating the field of migration studies in the United States, especially when...","[{""attributes"":null,""end"":535,""start"":15},{""attributes"":null,""end"":1918,""start"":537},{""attributes"":null,""end"":2589,""start"":1920},{""attributes"":null,""end"":3020,""start"":2591},{""attributes"":null,""end"":4107,""start"":3041},{""attributes"":null,""end"":5742,""start"":4109},{""attributes"":null,""end"":9034,""start"":5744},{""attributes"":null,""end"":10355,""start"":9036},{""attributes"":null,""end"":10834,""start"":10398},{""attributes"":null,""end"":11729,""start"":10861},{""attributes"":null,""end"":12685,""start"":11743},{""attrib...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":3039,""start"":3022},{""attributes"":null,""end"":10396,""start"":10357},{""attributes"":null,""end"":10859,""start"":10836},{""attributes"":null,""end"":11741,""start"":11731},{""attributes"":null,""end"":12912,""start"":12891},{""attributes"":null,""end"":13190,""start"":13180},{""attributes"":null,""end"":14560,""start"":14526},{""attributes"":null,""end"":15183,""start"":15123},{""attributes"":null,""end"":16637,""start"":16614},{""attributes"":null,""end"":16659,""start"":16639..."
8,260400257,,10.3389/fendo.2023.1233685,,Duration of oestrogen exposure does not affect reproductive outcome in artificial cycles: a retrospective analysis of more than 7000 hormonal replacement therapy cycles for an embryo transfer,CCBY,https://pmc.ncbi.nlm.nih.gov/articles/PMC10430777,GOLD,"\nIntroduction\n\nArtificial endometrial preparation with hormonal replacement therapy (HRT) is frequently used for frozen embryo transfer (FET) and egg donation cycles (1). This protocol involves the administration of exogenous estrogen and progesterone trying to mimic the hormonal changes happening physiologically in a natural cycle. \n\nWhereas the number of days of progesterone administration until the embryo transfer (ET) is clearly defined according to the embryo development stage, the...","[{""attributes"":null,""end"":335,""start"":15},{""attributes"":null,""end"":1096,""start"":337},{""attributes"":null,""end"":1728,""start"":1098},{""attributes"":null,""end"":2408,""start"":1730},{""attributes"":null,""end"":2695,""start"":2410},{""attributes"":null,""end"":3114,""start"":2697},{""attributes"":null,""end"":3460,""start"":3116},{""attributes"":null,""end"":4055,""start"":3462},{""attributes"":null,""end"":4298,""start"":4100},{""attributes"":null,""end"":4858,""start"":4318},{""attributes"":null,""end"":5227,""start"":4871},{""attributes"":n...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{""attributes"":{""n"":""2""},""end"":4078,""start"":4057},{""attributes"":{""n"":""2.1""},""end"":4098,""start"":4080},{""attributes"":{""n"":""2.1.1""},""end"":4316,""start"":4300},{""attributes"":{""n"":""2.1.2""},""end"":4869,""start"":4860},{""attributes"":{""n"":""2.2""},""end"":6852,""start"":6808},{""attributes"":{""n"":""2.2.2""},""end"":8396,""start"":8382},{""attributes"":{""n"":""2.3""},""end"":9261,""start"":9241},{""attributes"":{""n"":""3""},""end"":10434,""start"":10427},{""attributes"":{""n"":""3.1""},""end"":10456,""..."
9,15956973,2469794930.0,10.1038/aps.2015.166,,Protrusion-localized STAT3 mRNA promotes metastasis of highly metastatic hepatocellular carcinoma cells in vitro,,https://pmc.ncbi.nlm.nih.gov/articles/PMC4954761,BRONZE,"\nIntroduction\n\nHepatocellular carcinoma (HCC) is the fourth most common malignant cancer in the world. Approximately 250 000 people worldwide die of HCC each year [1,2] . More than 60% of patients who suffer from HCC are not diagnosed at an early stage because of the unclear pathogenesis of the disease, a hallmark of HCC. A delayed diagnosis of HCC results in only a 5% survival rate after 5 years [3] . Most cancer-associated mortality is related to the metastasis of malignant cancer cells...","[{""attributes"":null,""end"":1868,""start"":15},{""attributes"":null,""end"":2771,""start"":1870},{""attributes"":null,""end"":3725,""start"":2773},{""attributes"":null,""end"":4910,""start"":3750},{""attributes"":null,""end"":5950,""start"":4912},{""attributes"":null,""end"":7272,""start"":6002},{""attributes"":null,""end"":8683,""start"":7274},{""attributes"":null,""end"":9806,""start"":8685},{""attributes"":null,""end"":10110,""start"":9830},{""attributes"":null,""end"":11872,""start"":10193},{""attributes"":null,""end"":13985,""start"":11874},{""attrib...","[{""attributes"":null,""end"":13,""start"":1},{""attributes"":null,""end"":3748,""start"":3727},{""attributes"":null,""end"":6000,""start"":5952},{""attributes"":null,""end"":9828,""start"":9808},{""attributes"":null,""end"":10119,""start"":10112},{""attributes"":null,""end"":10191,""start"":10121},{""attributes"":null,""end"":14687,""start"":14637},{""attributes"":null,""end"":16126,""start"":16045},{""attributes"":null,""end"":19414,""start"":19404}]"


In [9]:
# Three-step query over the raw S2ORC v2 table:
#   1. raw_semanticscholar_s2orcv2          - the raw table as-is
#   2. base_semanticscholar_s2orcv2_step01  - scalar fields pulled out of the
#                                             nested openaccessinfo/body columns
#   3. base_semanticscholar_s2orcv2         - same fields, with the license
#                                             value normalised: NULL and the
#                                             listed publisher-specific values
#                                             become 'unknown-reusability',
#                                             'pd' becomes 'public-domain',
#                                             everything else passes through
# The last CTE carries the same name as the "02_stg" table queried further
# down; presumably this cell is the draft of that model (confirm).
base_model_sql = """
WITH
raw_semanticscholar_s2orcv2 AS 
(
    SELECT * FROM "01_raw".semanticscholar_s2orc_v2
),
base_semanticscholar_s2orcv2_step01 AS 
(
    SELECT
        corpusid AS id_semanticscholar,
        JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.externalids.mag') AS id_mag,
        JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.externalids.doi') AS id_doi,
        JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.externalids.arxiv') AS id_arxiv,
        title,
        JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.license') AS original_license,
        JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.url') AS source_url,
        JSON_EXTRACT_SCALAR(CAST(openaccessinfo AS JSON), '$.status') AS openaccess_status,
        JSON_EXTRACT_SCALAR(CAST(body AS JSON), '$.text') AS content_text,
        JSON_EXTRACT_SCALAR(CAST(body AS JSON), '$.annotations.paragraph') AS annotations_paragraph,
        JSON_EXTRACT_SCALAR(CAST(body AS JSON), '$.annotations.section_header') AS annotations_section_header
     FROM 
         raw_semanticscholar_s2orcv2
),
base_semanticscholar_s2orcv2 AS (
    SELECT
        id_semanticscholar,
        id_mag,
        id_doi,
        id_arxiv,
        title,
        source_url,
        openaccess_status,
        content_text,
        annotations_paragraph,
        annotations_section_header,
        CASE
            -- These licenses might or might not be usable just because they are openly accessible, so we are assuming they are not
            WHEN 
                original_license IS NULL OR
                original_license IN(
                    'acs-specific: authorchoice/editors choice usage agreement',
                    'elsevier-specific: oa user license',
                    'other-oa',
                    'publisher-specific, author manuscript: http://academic.oup.com/journals/pages/about_us/legal/notices',
                    'publisher-specific-oa',
                    'unspecified-oa',
                    'Open Government Licence - Canada',
                    'publisher-specific, author manuscript',
                    'implied-oa',
                    'publisher-specific, author manuscript: http://rsc.li/journals-terms-of-use',
                    'publisher-specific, author manuscript: http://onlinelibrary.wiley.com/termsAndConditions',
                    'publisher-specific, author manuscript: http://onlinelibrary.wiley.com/termsAndConditions#am',
                    'publisher-specific license'
                ) 
            THEN 'unknown-reusability'
            -- A few public domain licenses were abbreviated
            WHEN original_license = 'pd' THEN 'public-domain'
            ELSE original_license
        END AS license
     FROM 
         base_semanticscholar_s2orcv2_step01
)
SELECT * FROM base_semanticscholar_s2orcv2 
LIMIT 5
"""

# Display settings come from the project helper in utils; here it is called
# with its defaults.
utils.pd_set_options()
wr.athena.read_sql_query(sql=base_model_sql, database='01_raw')

Unnamed: 0,license
0,CCBYND
1,unknown-reusability
2,CCBYNCSA
3,CCBYNC
4,public-domain
5,gpl
6,CC0
7,CCBY
8,CCBYSA
9,CCBYNCND


In [10]:
# Number of rows per license value in the staged S2ORC v2 table.
license_counts_sql = """
SELECT
    license,
    COUNT(*) AS c
 FROM 
     "02_stg".base_semanticscholar_s2orcv2
 GROUP BY
     license
 """

# Display settings come from the project helper in utils (defined outside
# this notebook).
utils.pd_set_options(cols=500)
wr.athena.read_sql_query(sql=license_counts_sql, database='02_stg')

Unnamed: 0,license,c
0,CCBYND,18510
1,gpl,11
2,public-domain,22963
3,CCBYNC,1131261
4,CCBYNCSA,333540
5,CCBY,6487518
6,CCBYSA,127094
7,mit,5443
8,unknown-reusability,2526433
9,CCBYNCND,945585


In [10]:
# Join-coverage check: LEFT JOIN the staged S2ORC v2 rows to the raw papers
# and raw abstracts tables on the corpus id, flag per row whether each join
# found a partner (1) or not (0), and count rows per flag value.
# The result has one row per (join_type, join_worked) combination plus an
# 'original' row holding the unjoined row count of the staged table, so the
# counts can be compared against it.
join_coverage_sql = """
WITH
base_semanticscholar_s2orcv2_ AS (
    SELECT * FROM "02_stg".base_semanticscholar_s2orcv2
),
raw_semanticscholar_papers AS (
    SELECT * FROM "01_raw".semanticscholar_papers
),
raw_semanticscholar_abstracts AS (
    SELECT * FROM "01_raw".semanticscholar_abstracts
),
semanticscholar_joined AS (
    SELECT
        base_semanticscholar_s2orcv2_.id_semanticscholar,
        base_semanticscholar_s2orcv2_.id_mag,
        base_semanticscholar_s2orcv2_.id_doi,
        base_semanticscholar_s2orcv2_.id_arxiv,
        base_semanticscholar_s2orcv2_.title,
        base_semanticscholar_s2orcv2_.source_url,
        base_semanticscholar_s2orcv2_.openaccess_status,
        base_semanticscholar_s2orcv2_.content_text,
        base_semanticscholar_s2orcv2_.annotations_paragraph,
        base_semanticscholar_s2orcv2_.annotations_section_header,
        base_semanticscholar_s2orcv2_.license,

        CASE WHEN raw_semanticscholar_abstracts.corpusid IS NULL THEN 0 ELSE 1 END AS abstracts_join_worked,
        raw_semanticscholar_abstracts.abstract AS content_abstract,

        CASE WHEN raw_semanticscholar_papers.corpusid IS NULL THEN 0 ELSE 1 END AS papers_join_worked,
        raw_semanticscholar_papers.year AS publication_year,
        raw_semanticscholar_papers.publicationdate AS publication_date
    FROM
        base_semanticscholar_s2orcv2_
    LEFT JOIN
        raw_semanticscholar_papers
    ON
        base_semanticscholar_s2orcv2_.id_semanticscholar = raw_semanticscholar_papers.corpusid
    LEFT JOIN
        raw_semanticscholar_abstracts
    ON
        base_semanticscholar_s2orcv2_.id_semanticscholar = raw_semanticscholar_abstracts.corpusid
),
count_join_to_abstracts AS (
SELECT
    'abstracts' AS join_type,
    abstracts_join_worked AS join_worked,
    COUNT(*) AS c
FROM
    semanticscholar_joined
GROUP BY
    abstracts_join_worked
),
count_join_to_papers AS (
SELECT
    'papers' AS join_type,
    papers_join_worked AS join_worked,
    COUNT(*) AS c
FROM
    semanticscholar_joined
GROUP BY
    papers_join_worked
),
count_s2orcv2 AS (
SELECT
    'original' AS join_type,
    CAST(NULL AS INT) AS join_worked,
    COUNT(*) AS c
FROM
    base_semanticscholar_s2orcv2_
)
SELECT * FROM count_join_to_abstracts UNION ALL
SELECT * FROM count_join_to_papers UNION ALL
SELECT * FROM count_s2orcv2
 """

# Display settings come from the project helper in utils (defined outside
# this notebook).
utils.pd_set_options(cols=500)
wr.athena.read_sql_query(sql=join_coverage_sql, database='02_stg')

Unnamed: 0,join_type,join_worked,c
0,original,,11609787
1,abstracts,1.0,10910456
2,abstracts,0.0,699331
3,papers,1.0,11609787


In [19]:
# How many staged S2ORC v2 rows carry an arXiv id? The `flags` CTE derives a
# 0/1 presence flag per identifier column; only `has_id_arxiv` is aggregated,
# and the count is reported in millions of rows.
utils.pd_set_options(cols=500)

arxiv_id_presence_sql = """
WITH
base_semanticscholar_s2orcv2_ AS (
SELECT
    *
FROM 
     "02_stg".base_semanticscholar_s2orcv2
),
flags AS (
SELECT 
    CASE WHEN id_semanticscholar IS NULL THEN 0 ELSE 1 END AS has_id_semanticscholar,
    CASE WHEN id_mag IS NULL THEN 0 ELSE 1 END AS has_id_mag,
    CASE WHEN id_doi IS NULL THEN 0 ELSE 1 END AS has_id_doi,
    CASE WHEN id_arxiv IS NULL THEN 0 ELSE 1 END AS has_id_arxiv--,
    --*
FROM
    base_semanticscholar_s2orcv2_
)
SELECT has_id_arxiv, COUNT(*)/1000000.0 AS c FROM flags GROUP BY has_id_arxiv
 """

# Last expression of the cell, so the resulting frame is rendered as the output.
wr.athena.read_sql_query(sql=arxiv_id_presence_sql, database="02_stg")

Unnamed: 0,has_id_arxiv,c
0,1,2.60051
1,0,9.009277


In [8]:
# Compare two candidate join keys between the staged S2ORC v2 table and the
# staged arXiv metadata table: count the joined rows that found a partner
# via `id_arxiv` and via `id_doi`, next to the total number of arXiv
# metadata rows ('original').
utils.pd_set_options(cols=500)

arxiv_join_counts_sql = """
WITH
base_semanticscholar_s2orcv2_ AS (
    SELECT * FROM "02_stg".base_semanticscholar_s2orcv2
),
base_arxiv_metadata_ AS (
    SELECT * FROM "02_stg".base_arxiv_metadata
),
join_on_arxiv_id AS (
    SELECT
        base_semanticscholar_s2orcv2_.id_semanticscholar AS semanticscholar_id_semanticscholar,
        base_semanticscholar_s2orcv2_.id_arxiv AS semanticscholar_id_arxiv,
        base_semanticscholar_s2orcv2_.id_doi AS semanticscholar_id_doi,
        base_arxiv_metadata_.id_arxiv AS arxiv_id_arxiv,
        base_arxiv_metadata_.id_doi AS arxiv_id_doi,
        CASE WHEN base_arxiv_metadata_.id_arxiv IS NULL THEN 0 ELSE 1 END AS join_worked
    FROM
        base_semanticscholar_s2orcv2_
    LEFT JOIN
        base_arxiv_metadata_
    ON
        base_semanticscholar_s2orcv2_.id_arxiv = base_arxiv_metadata_.id_arxiv
),
count_join_on_arxiv_id AS (
SELECT
    'arxiv' AS join_type,
    COUNT(*) AS c
FROM
    join_on_arxiv_id
WHERE
    join_worked = 1
GROUP BY
    join_worked
),
join_on_doi_id AS (
    SELECT
        base_semanticscholar_s2orcv2_.id_semanticscholar AS semanticscholar_id_semanticscholar,
        base_semanticscholar_s2orcv2_.id_arxiv AS semanticscholar_id_arxiv,
        base_semanticscholar_s2orcv2_.id_doi AS semanticscholar_id_doi,
        base_arxiv_metadata_.id_arxiv AS arxiv_id_arxiv,
        base_arxiv_metadata_.id_doi AS arxiv_id_doi,
        CASE WHEN base_arxiv_metadata_.id_doi IS NULL THEN 0 ELSE 1 END AS join_worked
    FROM
        base_semanticscholar_s2orcv2_
    LEFT JOIN
        base_arxiv_metadata_
    ON
        base_semanticscholar_s2orcv2_.id_doi = base_arxiv_metadata_.id_doi
),
count_join_on_doi_id AS (
SELECT
    'doi' AS join_type,
    COUNT(*) AS c
FROM
    join_on_doi_id
WHERE
    join_worked = 1
GROUP BY
    join_worked
),
count_arxiv AS (
SELECT
    'original' AS join_type,
    COUNT(*) AS c
FROM
    base_arxiv_metadata_
)
SELECT * FROM count_join_on_arxiv_id UNION ALL
SELECT * FROM count_join_on_doi_id UNION ALL
SELECT * FROM count_arxiv
 """

# Last expression of the cell, so the resulting frame is rendered as the output.
wr.athena.read_sql_query(sql=arxiv_join_counts_sql, database="02_stg")

Unnamed: 0,join_type,c
0,original,2816721
1,arxiv,2600510
2,doi,1166552


In [7]:
# Left-join the staged S2ORC v2 rows to the staged arXiv metadata on
# `id_arxiv`, carrying the arXiv-side license along as `arxiv_license`, and
# report how many joined rows matched via `id_arxiv` and via `id_doi`, next
# to the total number of arXiv metadata rows ('original').
#
# Fixes relative to the previous version of this cell:
#   * the SELECT list of `join_on_arxiv_id` was missing the comma between
#     `license` and `arxiv_license`, which made the statement unparsable;
#   * `count_join_on_arxiv_id` filters/groups on `join_worked`, but
#     `join_on_arxiv_id` no longer projected that column; the 0/1 flag is
#     restored with the same definition the sibling DOI join uses.
# NOTE(review): the final SELECT still only returns the match counts; the
# wide `join_on_arxiv_id` projection looks like preparation for a later
# step -- confirm the intended output of this cell.
utils.pd_set_options(cols=500)

wr.athena.read_sql_query("""
WITH
base_semanticscholar_s2orcv2_ AS (
    SELECT * FROM "02_stg".base_semanticscholar_s2orcv2
),
base_arxiv_metadata_ AS (
    SELECT * FROM "02_stg".base_arxiv_metadata
),
join_on_arxiv_id AS (
    SELECT
        base_semanticscholar_s2orcv2_.id_semanticscholar,
        base_semanticscholar_s2orcv2_.id_mag,
        base_semanticscholar_s2orcv2_.id_doi,
        base_semanticscholar_s2orcv2_.id_arxiv,
        base_semanticscholar_s2orcv2_.title,
        base_semanticscholar_s2orcv2_.source_url,
        base_semanticscholar_s2orcv2_.openaccess_status,
        base_semanticscholar_s2orcv2_.content_text,
        base_semanticscholar_s2orcv2_.annotations_paragraph,
        base_semanticscholar_s2orcv2_.annotations_section_header,
        base_semanticscholar_s2orcv2_.license,

        base_arxiv_metadata_.license AS arxiv_license,
        CASE WHEN base_arxiv_metadata_.id_arxiv IS NULL THEN 0 ELSE 1 END AS join_worked
    FROM
        base_semanticscholar_s2orcv2_
    LEFT JOIN
        base_arxiv_metadata_
    ON
        base_semanticscholar_s2orcv2_.id_arxiv = base_arxiv_metadata_.id_arxiv
),
count_join_on_arxiv_id AS (
SELECT
    'arxiv' AS join_type,
    COUNT(*) AS c
FROM
    join_on_arxiv_id
WHERE
    join_worked = 1
GROUP BY
    join_worked
),
join_on_doi_id AS (
    SELECT
        base_semanticscholar_s2orcv2_.id_semanticscholar AS semanticscholar_id_semanticscholar,
        base_semanticscholar_s2orcv2_.id_arxiv AS semanticscholar_id_arxiv,
        base_semanticscholar_s2orcv2_.id_doi AS semanticscholar_id_doi,
        base_arxiv_metadata_.id_arxiv AS arxiv_id_arxiv,
        base_arxiv_metadata_.id_doi AS arxiv_id_doi,
        CASE WHEN base_arxiv_metadata_.id_doi IS NULL THEN 0 ELSE 1 END AS join_worked
    FROM
        base_semanticscholar_s2orcv2_
    LEFT JOIN
        base_arxiv_metadata_
    ON
        base_semanticscholar_s2orcv2_.id_doi = base_arxiv_metadata_.id_doi
),
count_join_on_doi_id AS (
SELECT
    'doi' AS join_type,
    COUNT(*) AS c
FROM
    join_on_doi_id
WHERE
    join_worked = 1
GROUP BY
    join_worked
),
count_arxiv AS (
SELECT
    'original' AS join_type,
    COUNT(*) AS c
FROM
    base_arxiv_metadata_
)
SELECT * FROM count_join_on_arxiv_id UNION ALL
SELECT * FROM count_join_on_doi_id UNION ALL
SELECT * FROM count_arxiv
 """, '02_stg')

Unnamed: 0,id_arxiv,id_doi,title,abstract,license
0,astro-ph/0110109,,XMM-Newton First Observation in the Pleiades,"We present the first results from a 40 ks Guaranteed Time XMM-Newton pointing in the Pleiades. We detect almost all early-mid dM members in the field and several very low mass (VLM) stars - including the brown dwarf (BD) candidate Roque 9 - and investigate the variation of X-ray activity levels, hardness ratios and flare frequency with spectral type down to the BD regime.",
1,astro-ph/0110111,,Detection of spiral structure of the quiescent accretion disk of IP Pegasi,"We present the results of the spectral investigations of IP Peg in quiescence. Optical spectra obtained on 6-m telescope at the Special Astrophysical Observatory (Russia), and on the 3.5-m telescope at the German-Spanish Astronomical Center (Calar Alto, Spain), have been analysed by means of Doppler tomography and phase modeling technique. This analysis has allowed us to make conclusions, that the quiescent accretion disk of IP Peg has a complicated structure. Equally with the bright spot ...",
2,astro-ph/0110114,,The Be/X-ray Binary LSI+61303 in terms of Ejector-Propeller Model,"We tested the ejector-propeller model of the Be/X-ray binary LSI+61303 (V 615 Cas, GT 0236+620) by using the parameters predicted by the model in the calculations of the X-ray and radio variability. The results are: (1) in terms of the Ejector-Propeller model, the X-ray maximum is due to the periastron passage; (2) the radio outburst can be really a result of the transition from the propeller to ejector regimes; (3) the radio outburst will delay with respect to the X-ray maximum every orbi...",
3,astro-ph/0110119,,The TexOx Survey of Radio-Selected Galaxy Clusters,"We present some initial results from the TexOx (Texas-Oxford) Cluster (TOC) survey - a new method of selecting distant galaxy clusters. The cosmic evolution of the radio source population suggests that some massive clusters at high redshift will contain several radio-loud AGN. We searched for extreme over-densities at ~mJy levels in 7' x 7' boxes within the NVSS radio catalogue, covering a large (~1100 square degree) sky area. We have acquired optical images for ~130 cluster candidates, an...",
4,astro-ph/0110120,,The z ~ 1.2 Galaxy Luminosity Function from The LCIR Survey,"We present results from the Las Campanas Infrared Survey, designed to identify a statistically significant sample of z>=1 galaxies using photometric redshift techniques. Here we summarize the design and strategies of the survey and present the first estimate of the galaxy luminosity function at z>=1 based on H-band selected galaxies identified in our survey. Results of number count studies and luminosity function measurements indicate that most early-type galaxies were already in place by ...",
5,astro-ph/0110121,,Low-frequency Carbon Recombination Lines towards HI self-absorption features in the Galactic Plane,A survey of radio recombination lines (RLs) in the Galactic plane ($l = $ 332\deg $\to$ 89\deg) near 327 MHz made using the Ooty Radio Telescope (ORT) has detected carbon RLs from all the positions in the longitude range 0\deg $< l < $ 20\deg and from a few positions at other longitudes. The carbon RLs detected in this survey originate from ``diffuse'' \CII regions. Comparison of the \lv diagram and the radial distribution of carbon line emission with those obtained from hydrogen RLs near ...,
6,astro-ph/0110127,,Polarization properties of the 6.7 GHz methanol masers in NGC6334F,"The Australia Telescope Compact Array (ATCA) has been used to make the first full polarization observations of 6.7 GHz methanol masers. Linear polarization was detected towards all four sources observed, at levels between a few and 10%, while none of the sources show circular polarization stronger than approximately 1.5%. Linear polarization appears to be more common in the 6.7 GHz methanol maser transition than it is for the 12.2 GHz transition, consistent with the hypothesis that the 6.7...",
7,astro-ph/0110131,,A search for high redshift clusters associated with radio galaxies at 2 < z < 4,"High redshift radio galaxies are amongst the most massive galaxies in the early Universe and have properties expected from central galaxies in forming clusters. We are carrying out an observational programme on the VLT to find and study galaxy proto clusters around radio galaxies at redshifts 2 < z < 4. First, we use narrow band imaging to select candidate galaxies which show excess Lyman alpha emission at redshifts similar to the central radio galaxy. Then, we use multi object spectroscop...",
8,astro-ph/0110134,,"Magnetically Induced ""Dry"" Water Like Structure of Charged Fluid at the Core of a Magnetar","It is shown that charged fluid, e.g., electron gas or proton matter at the core of a magnetar exhibit super-fluid (frictionless) like property if the magnetic field strength is high enough to populate only the zeroth Landau levels.",
9,astro-ph/0110137,,Negative skewness of radial pairwise velocity in the quasi-nonlinear regime: Zel'dovich approximation,"According to N-body numerical simulations, the radial pairwise velocities of galaxies have negative skewness in the quasi-nonlinear regime. To understand its origin, we calculate the probability distribution function of the radial pairwise velocity using the Zel'dovich approximation, i.e., an analytical approximation for gravitational clustering. The calculated probability distribution function is in good agreement with the result of N-body simulations. Thus the negative skewness originate...",
