In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Make the shared project modules importable by adding ../01_modules and/or
# ./01_modules to the system path. Going by the variable names, the first is
# the development layout and the second the production layout.
if '__file__' in globals():
    # Executed as a script: resolve relative to this file's directory.
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    # `__file__` is not defined (e.g. inside a notebook kernel): fall back to
    # the current working directory.
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
for _modules_path in (modules_path_in_dev, modules_path_in_prod):
    # Only add directories that exist, and only once: notebook cells get
    # re-run often and sys.path should not accumulate duplicate entries.
    if os.path.exists(_modules_path) and _modules_path not in sys.path:
        sys.path.append(_modules_path)


# Jupyter only reads a local module the first time it is imported after
# kernel start. Re-running a cell with "from mymodulename import *" would
# not change anything, even if the imported module has since changed.
# As a workaround, each local module is imported by name and then passed
# to importlib.reload, so edits made to it since the first import take effect.
# The value returned by reload() is assigned to `_` so that it is discarded
# rather than left as a bare expression (which, on the last line of a cell,
# would be echoed as the cell's output).
# NOTE(review): the order matters if `utils` itself imports `config` --
# confirm before reordering these statements.
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [3]:
# Peek at a handful of raw rows to see which columns the source table exposes.
# NOTE(review): pd_set_options presumably configures pandas display settings
# for wide frames -- confirm in utils.py.
utils.pd_set_options()

sample_rows_sql = """
SELECT
    *
 FROM 
     "01_raw".arxiv_metadatada -- mind the typo
 WHERE
     TRUE
 LIMIT 5
 """

# The bare call is the last expression, so the resulting frame is the cell output.
wr.athena.read_sql_query(sql=sample_rows_sql, database='01_raw')

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,abstract,report-no,license,categories,versions,versions_dates
0,hep-ph/0204230,Timo Arvid Lahde,"T.A. Lahde, D.O. Riska (Helsinki Institute of Physics and Department of Physics, University of Helsinki, Finland)",The Coupling of eta Mesons to Quarks and Baryons from D_s^* -> D_s pi^0 Decay,"17 pages, uses Feynmf. Submitted to Nuclear Physics A, accepted version",Nucl.Phys. A710 (2002) 99-116,10.1016/S0375-9474(02)01125-9,"The known ratio of the branching fractions for D_s^* --> D_s pi^0 and D_s^* --> D_s gamma may be used to extract the coupling of eta mesons to strange quarks once the value of the pi^0-eta mixing angle is known. This requires that realistic models for the spectra as well as the magnetic dipole (M1) decays of the heavy-light (Q qbar) mesons are available. The coupling of eta mesons to light quarks may then be estimated using SU(3) flavor symmetry. Applied to the quark model for the baryons, an eta NN pseudovector coupling constant of f_{eta NN} = 0.35^{+0.15}_{-0.25} is obtained. If the charm quark couples significantly to the eta meson, as is suggested by the decay mode psi' --> J/(psi eta), then somewhat larger values of f_{eta NN} can be obtained. These values are sufficiently small to be consistent with phenomenological analysis of photoproduction of the eta on the nucleon and the reaction pp --> pp eta.",,,[hep-ph],"[v1, v2]","[Fri, 19 Apr 2002 10:59:12 GMT, Wed, 07 Aug 2002 13:28:20 GMT]"
1,hep-ph/0204231,Steen Hannestad,"Steen Hannestad, Petteri Keranen, Francesco Sannino",A supernova constraint on bulk majorons,"Minor changes, matches the version to appear in PRD",Phys.Rev. D66 (2002) 045002,10.1103/PhysRevD.66.045002,"In models with large extra dimensions all gauge singlet fields can in principle propagate in the extra dimensional space. We have investigated possible constraints on majoron models of neutrino masses in which the majorons propagate in extra dimensions. It is found that astrophysical constraints from supernovae are many orders of magnitude stronger than previous accelerator bounds. Our findings suggest that unnatural types of the ""see-saw"" mechanism for neutrino masses are unlikely to occur in nature, even in the presence of extra dimensions.",,,[hep-ph astro-ph],"[v1, v2]","[Fri, 19 Apr 2002 12:24:40 GMT, Wed, 17 Jul 2002 10:11:33 GMT]"
2,hep-ph/0204234,Carl Shakin,"L.S. Celenza, Hu Li, C.M. Shakin, and Qing Sun",Quark and Nucleon Self-Energy in Dense Matter,"19 pages, 8 figures, 2 tables, revtex",Phys.Rev. D66 (2002) 054010,10.1103/PhysRevD.66.054010,"In a recent work we introduced a nonlocal version of the Nambu--Jona-Lasinio(NJL) model that was designed to generate a quark self-energy in Euclidean space that was similar to that obtained in lattice simulations of QCD. In the present work we carry out related calculations in Minkowski space, so that we can study the effects of the significant vector and axial-vector interactions that appear in extended NJL models and which play an important role in the study of the $\rho$, $\omega$ and $a_1$ mesons. We study the modification of the quark self-energy in the presence of matter and find that our model reproduces the behavior of the quark condensate predicted by the model-independent relation $<\bar qq>_{\rho} = <\bar qq>_0(1-\sigma_N\rho_N/f_{\pi}^2m_{\pi}^2 +...)$, where $\sigma_N$ is the pion-nucleon sigma term and $\rho_N$ is the density of nuclear matter. (Since we do not include a model of confinement, our study is restricted to the analysis of quark matter. We provide some ...",,,[hep-ph nucl-th],[v1],"[Fri, 19 Apr 2002 15:29:09 GMT]"
3,hep-ph/0204237,Tom Steele,Ailin Zhang and T.G. Steele,Decays of the $\hat\rho(1^{-+})$ Exotic Hybrid and $\eta$-$\eta'$ Mixing,"latex2e, 11 pages with 4 embedded eps figures. v 2 corrects reference [5] and minor error in equation (11)",Phys.Rev. D65 (2002) 114013,10.1103/PhysRevD.65.114013,"QCD sum-rules are used to calculate the $\hat\rho(1^{-+})\to\pi\eta, \pi\eta'$ decay widths of the exotic hybrid in two different $\eta-\eta'$ mixing schemes.\n In the conventional flavour octet-singlet mixing scheme, the decay widths are both found to be small, while in the recently-proposed quark mixing scheme, the decay width $\Gamma_{\hat\rho\to\eta\pi}\approx 250 MeV$ is large compared with the decay width $\Gamma_{\hat\rho\to\eta^\prime\pi}\approx 20 MeV$. These results provide some insight into $\eta$-$\eta'$ mixing and hybrid decay features.",,,[hep-ph],"[v1, v2]","[Fri, 19 Apr 2002 20:12:47 GMT, Thu, 29 Aug 2002 15:09:34 GMT]"
4,hep-ph/0204238,Ugur Erkarslan,Z. Z. Aydin and U. Erkarslan,The charm quark EDM and singlet P-wave charmonium production in supersymmetry,"10 pages, 10 figures",Phys.Rev. D67 (2003) 036006,10.1103/PhysRevD.67.036006,"We analyze the singlet $P$--wave charmonium production at $e^+ e^-$ colliders within the framework of unconstrained supersymmetry. We show that the CP--violating transitions, dominated by the gluino exchange, are typically four orders of magnitude larger than the CP--conserving ones, and former is generated by the electric dipole moment of the charm quark. Our results can be directly tested via the charmonium searches at the CLEO--c experiment.",,,[hep-ph],"[v1, v2, v3]","[Fri, 19 Apr 2002 21:49:48 GMT, Fri, 26 Apr 2002 07:27:04 GMT, Sun, 22 Dec 2002 11:57:44 GMT]"


In [4]:
# Total number of rows in the raw arXiv metadata table.
row_count_sql = """
SELECT
    COUNT(*) AS c
 FROM 
     "01_raw".arxiv_metadatada -- mind the typo
 """

wr.athena.read_sql_query(sql=row_count_sql, database='01_raw')

Unnamed: 0,c
0,2816721


In [6]:
# Count how many rows carry each distinct `license` value, using the column
# selection of the planned base model (ids, title, abstract, license).
utils.pd_set_options()

license_distribution_sql = """
WITH
raw_arxiv_metadata AS 
(
    SELECT * FROM "01_raw".arxiv_metadatada -- mind the typo, fixed in downstream models
),
base_arxiv_metadata AS 
(
    SELECT
        id AS arxiv_id, 
        doi AS doi_id,
        title,
        abstract,
        license -- TODO: explore and extract s2orcv2 to determine its license keys and map it to here
    FROM
        raw_arxiv_metadata
)
SELECT license, COUNT(*) AS c FROM base_arxiv_metadata GROUP BY license
"""

wr.athena.read_sql_query(sql=license_distribution_sql, database='01_raw')

Unnamed: 0,license,c
0,http://creativecommons.org/licenses/by-nc-sa/4.0/,52087
1,http://creativecommons.org/publicdomain/zero/1.0/,18092
2,http://creativecommons.org/licenses/by-nc-nd/4.0/,67702
3,http://creativecommons.org/licenses/by/4.0/,425906
4,http://creativecommons.org/licenses/by-sa/4.0/,23748
5,http://creativecommons.org/licenses/by/3.0/,7914
6,http://creativecommons.org/licenses/by-nc-sa/3.0/,5876
7,http://creativecommons.org/licenses/publicdomain/,2475
8,http://arxiv.org/licenses/nonexclusive-distrib/1.0/,1760164
9,,452757


In [5]:
# Sanity check for the license mapping: the CASE expression rewrites every known
# license URL to a short key and prefixes anything unrecognised with 'ArXiv: '.
# The final SELECT lists only those prefixed rows, so an empty result means
# every non-NULL license value is covered by an explicit WHEN branch.
utils.pd_set_options()

unmapped_license_sql = """
WITH
raw_arxiv_metadata AS 
(
    SELECT * FROM "01_raw".arxiv_metadatada -- mind the typo, fixed in downstream models
),
base_arxiv_metadata AS 
(
    SELECT
        id AS arxiv_id, 
        doi AS doi_id,
        title,
        abstract,
        -- Converting license definitions to the same format as SemanticScholar
        CASE
            WHEN "license" IS NULL THEN NULL
            WHEN "license" = 'http://creativecommons.org/licenses/by-nc-sa/4.0/' THEN 'CCBYNCSA'
            WHEN "license" = 'http://creativecommons.org/publicdomain/zero/1.0/' THEN 'CC0'
            WHEN "license" = 'http://creativecommons.org/licenses/by-nc-nd/4.0/' THEN 'CCBYNCND'
            WHEN "license" = 'http://creativecommons.org/licenses/by/4.0/' THEN 'CCBY'
            WHEN "license" = 'http://creativecommons.org/licenses/by-sa/4.0/' THEN 'CCBYSA'
            WHEN "license" = 'http://creativecommons.org/licenses/by/3.0/' THEN 'CCBY'
            WHEN "license" = 'http://creativecommons.org/licenses/by-nc-sa/3.0/' THEN 'CCBYNCSA'
            WHEN "license" = 'http://creativecommons.org/licenses/publicdomain/' THEN 'public-domain'
            WHEN "license" = 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/' THEN 'ArXiv nonexclusive-distrib'
            ELSE CONCAT('ArXiv: ', "license")
        END AS license
    FROM
        raw_arxiv_metadata
)
SELECT * FROM base_arxiv_metadata WHERE license IS NOT NULL AND license LIKE 'ArXiv:%' LIMIT 3
"""

wr.athena.read_sql_query(sql=unmapped_license_sql, database='01_raw')

Unnamed: 0,arxiv_id,doi_id,title,abstract,license
