In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Make the project's shared helper modules importable by adding their folder
# to sys.path. Two layouts are supported:
#   * dev:  <script_dir>/../01_modules
#   * prod: <script_dir>/01_modules
# When run as a script, __file__ is defined and anchors the lookup; inside a
# Jupyter kernel it is not, so the current working directory is used instead.
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
# Only register folders that exist, and skip ones already registered so that
# re-running this cell does not keep growing sys.path with duplicates.
for _modules_path in (modules_path_in_dev, modules_path_in_prod):
    if os.path.exists(_modules_path) and _modules_path not in sys.path:
        sys.path.append(_modules_path)


# Jupyter only executes a local module the first time it is imported after
# kernel start. Re-running an import statement later does not pick up
# edits made to the module file in the meantime.
# As a workaround, each local module is imported and then immediately
# re-read with importlib.reload, so re-running this cell always reflects the
# current source on disk. The modules are used through their qualified names
# (e.g. utils.pd_set_options), so no `from ... import *` is needed.
# NOTE(review): the reload result is bound to `_`, presumably just to discard
# it; on a fresh kernel this import-then-reload pattern appears to load each
# module twice (see the duplicated "loaded" lines in the cell output).
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [5]:
# Configure the pandas display via the project helper before rendering a
# wide result (presumably `cols` raises the column display limit; see utils).
utils.pd_set_options(cols=500)

# Sample five rows of the raw Semantic Scholar S2ORC v2 table to inspect its
# schema. The DataFrame returned by the call is the cell's displayed output.
wr.athena.read_sql_query(
    sql="""
SELECT
    *
 FROM 
     "01_raw".semanticscholar_s2orc_v2 -- v2 has an _ prefix, it is removed from dowstream models
 LIMIT 5
 """,
    database='01_raw',
)

Unnamed: 0,corpusid,openaccessinfo,title,authors,body,bibliography
0,118396736,"{'disclaimer': 'This content is derived from https://arxiv.org/abs/1606.06824. ', 'externalids': {'medline': None, 'mag': '2953085941', 'acl': None, 'doi': '10.3847/2041-8205/825/2/L28', 'medrxiv': None, 'pubmedcentral': None, 'arxiv': '1606.06824'}, 'license': None, 'url': 'https://arxiv.org/abs/1606.06824', 'status': 'GREEN'}",PLANETESIMAL FORMATION BY GRAVITATIONAL INSTABILITY OF A POROUS DUST DISK,"[S. Michikoshi, E. Kokubo]","{'text': ' INTRODUCTION In the standard scenario of planet formation, planetesimals are the building blocks of planets (e.g., Safronov 1969;Hayashi et al. 1985). In a protoplanetary disk, small dust grains grow to kilometer-sized objects called planetesimals. From planetesimals, protoplanets, or planetary embryos, form through a process of runaway and oligarchic growth (e.g., Kokubo & Ida 1998. However, the formation mechanism of planetesimals is one of today's most important unsolved probl...","{'text': ' . I Adachi, C Hayashi, K Nakazawa, 10.1143/PTP.56.1756PThPh. 561756Adachi, I., Hayashi, C., & Nakazawa, K. 1976, PThPh, 56, 1756 . J Blum, G Wurm, 10.1006/icar.1999.6234Icar. 143138Blum, J., & Wurm, G. 2000, Icar, 143, 138 . P P Brown, D F Lawler, 10.1061/(ASCE)0733-9372(2003)129:3(222)J. Environ. Eng. 129222Brown, P. P., & Lawler, D. F. 2003, J. Environ. Eng., 129, 222 . E Chiang, A N Youdin, 10.1146/annurev-earth-040809-152513AREPS. 38493Chiang, E., & Youdin, A. N. 2010, AREP..."
1,226254396,"{'disclaimer': 'This content is derived from https://arxiv.org/abs/2011.02836. ', 'externalids': {'medline': None, 'mag': '3097105672', 'acl': None, 'doi': None, 'medrxiv': None, 'pubmedcentral': None, 'arxiv': '2011.02836'}, 'license': None, 'url': 'https://arxiv.org/abs/2011.02836', 'status': None}",Dynamically Throttleable Neural Networks (TNN),"[Hengyue Liu, Samyak Parajuli, Jesse Hostetler, S. Chai, B. Bhanu]","{'text': ' Introduction Deep learning models are typically trained offline to produce models with a static allocation of compute and memory resource. However, the conditions in real-world setting are often different, whereby the runtime inference is neither optimal from an accuracy or efficiency perspective. The problem lies in the current training approaches that produce static models that occupy a single point in the trade-space between performance and resource use. This paper presents an...","{'text': ' Deep elastic networks with model selection for multitask learning. C Ahn, E Kim, S Oh, Proceedings of the IEEE International Conference on Computer Vision (CVPR). the IEEE International Conference on Computer Vision (CVPR)2019 Estimating or propagating gradients through stochastic neurons for conditional computation. Y Bengio, N Léonard, A Courville, arXiv:1308.34322013arXiv preprint The OpenCV Library. G Bradski, Dr. Dobb's Journal of Software Tools. 2000 The Adaptive AI Appro..."
2,246138175,"{'disclaimer': 'This content is derived from https://doi.org/10.21203/rs.3.rs-1223190/v1. Its open-access license is CCBY.', 'externalids': {'medline': None, 'mag': None, 'acl': None, 'doi': '10.21203/rs.3.rs-1223190/v1', 'medrxiv': None, 'pubmedcentral': None, 'arxiv': None}, 'license': 'CCBY', 'url': 'https://doi.org/10.21203/rs.3.rs-1223190/v1', 'status': 'GREEN'}",Comparison of End-To-End Descending Hypoglossal-Facial Anastomosis And End-To-Side Hypoglossal-Facial Anastomosis For Facial Paralysis After Vestibular Schwannoma Surgery,"[Gang Song, Yi-Qin Zhou, Yafang Wu, Xiaolong Wu, Mingchu Li, Hong-chuan Guo, Ge Chen, Y. Bao, Jiantao Liang]","{'text': ' Background Facial paralysis is a severe complication of vestibular schwannoma (VS) surgery and signi cantly impacts quality of life. Hypoglossal-facial nerve anastomosis is one of the most effective methods for facial paralysis treatment after VS surgery [1][2][3][4][5][6]. However, classic hypoglossal-facial nerve anastomosis involves complete hypoglossal nerve transection, which leads to speech, masticatory and swallowing dysfunctions [7][8][9][10]. In the past three decades, s...","{'text': 'Acknowledgements: None.Availability of data and materials: The datasets used and/or analyzed during the current study are publicly available from the corresponding author.All data generated or analyzed during this study are included in thisarticle.Funding: This study was supported by the Beijing Medical Authority's ""Sailing"" Plan (XMLX201821).The sponsor had no role in the design or conduct of this research.Competing interests: The authors declare that there is no con ict of intere..."
3,89239448,"{'disclaimer': 'This content is derived from https://doi.org/10.4314/gjass.v15i1.5. Its open-access license is CCBY.', 'externalids': {'medline': None, 'mag': '2500539642', 'acl': None, 'doi': '10.4314/gjass.v15i1.5', 'medrxiv': None, 'pubmedcentral': None, 'arxiv': None}, 'license': 'CCBY', 'url': 'https://doi.org/10.4314/gjass.v15i1.5', 'status': 'HYBRID'}",The role of prostaglandins in livestock production,"[B. Okon, L. Ibom, A. Bassey, F. I. Okon]","{'text': ' INTRODUCTION Prostaglandins were first discovered and isolated from human semen in the 1930s by Ulf Von Euler of Sweden. Thinking they had come from the prostrate gland, he named them Prostaglandins. Prostaglandins are like hormones in that they act as chemical messengers, but do not move to other sites, but work right within the cells where they are synthesized (Ophardt, 2003). Von Euler (1937) also stated that biologically active lipids in human seminal plasma were first detect...","{'text': ' Theory of material recognition in swine based on estrogen controlled endocrine versus exocrine secretion of PGF 2 σ by uterine endometrium. F W Bazer, W W Thatcher, 1977 . Prostaglandin, 14 Luteolytic effects of prostaglandins in guinea pigs. F R Blatchley, Nature. 22110651969 The effects of prostaglandins and mating on release of LH in the female rabbit. J C Carlson, P Wong, D G Perrin, Journal of reproduction and Fertility. 511977a Inflammatory processes in pre-term and term..."
4,8180607,"{'disclaimer': 'This content is derived from https://arxiv.org/abs/1205.3752. ', 'externalids': {'medline': None, 'mag': '2137714710', 'acl': None, 'doi': None, 'medrxiv': None, 'pubmedcentral': None, 'arxiv': '1205.3752'}, 'license': None, 'url': 'https://arxiv.org/abs/1205.3752', 'status': None}",Revisiting Homomorphic Wavelet Estimation and Phase Unwrapping,"[R. H. Herrera, M. Baan]","{'text': ' Introduction We aim to create a new nonminimum-phase surface-consistent approach via log-spectral averaging by continuing research started by Tria et al. (2007). Nevertheless we have to deal previously with phase unwrapping problem as a vital step in log-spectral averaging and a long-standing problem in homomorphic deconvolution (Oppenheim et al., 1976;Lines, 1976;Tribolet, 1977). In this paper first we give a brief introduction to homomorphic deconvolution. We then provide the r...","{'text': 'AcknowledgementsThe authors thank the BLISS sponsors for financial support.We are, also gratefully to S. Kaplan for kindly supplying his Matlab code for the w-plane method. A Statistical Approach to the Extraction of the Seismic Propagating Wavelet. G Angeleri, Geophysical Prospecting. 311983 Source shape estimation and deconvolution of teleseismic bodywaves. R W Clayton, R A Wiggins, Geophysical Journal of the Royal Astronomical Society. 471976 Deconvolution and wavelet estimati..."


In [19]:
# Data-quality probe: list up to ten works whose primary-topic *domain* has a
# long id but no short id, together with the whole topic hierarchy
# (topic -> subfield -> field -> domain) for context.
wr.athena.read_sql_query(
    sql="""
SELECT
    id_openalex_short,
    id_doi_short,
    title,
    primary_topic_short_id,
    primary_topic_display_name,
    primary_topic_subfield_long_id,
    primary_topic_subfield_short_id,
    primary_topic_subfield_display_name,
    primary_topic_field_long_id,
    primary_topic_field_short_id,
    primary_topic_field_display_name,
    primary_topic_domain_long_id,
    primary_topic_domain_short_id,
    primary_topic_domain_display_name
 FROM 
     openalex_works_reduced
 WHERE
     primary_topic_domain_short_id IS NULL AND
     primary_topic_domain_long_id IS NOT NULL
 LIMIT 10
 """,
    database='01_raw',
)

Unnamed: 0,id_openalex_short,id_doi_short,title,primary_topic_short_id,primary_topic_display_name,primary_topic_subfield_long_id,primary_topic_subfield_short_id,primary_topic_subfield_display_name,primary_topic_field_long_id,primary_topic_field_short_id,primary_topic_field_display_name,primary_topic_domain_long_id,primary_topic_domain_short_id,primary_topic_domain_display_name
0,1650023746,10.4237/sbqp.09.102,Possibilidades de aplicação de ferramentas de análise da ventilação natural durante a concepção de projetos arquitetônicos,T10121,Building Energy Efficiency and Thermal Comfort Optimization,2215,,Building and Construction,22,,Engineering,3,,Physical Sciences
1,2635976061,,Prediction of population performance based on reliability,T10968,Skew Distributions and Applications in Statistics,2613,,Statistics and Probability,26,,Mathematics,3,,Physical Sciences
2,2730528915,,Santé et travail bien fait : que peut-on apprendre d'autres métiers ?,T14186,Healthcare Policy Reforms and Inequalities in France,3600,,General Health Professions,36,,Health Professions,4,,Health Sciences
3,3178722630,,Persian Sentence-level Sentiment Polarity Classification,T10664,Sentiment Analysis and Opinion Mining,1702,,Artificial Intelligence,17,,Computer Science,3,,Physical Sciences
4,4288090456,,On the high temperature crack propagation in the nickel-based superalloy AD730TM,T13129,Materials Science and Technology,2500,,General Materials Science,25,,Materials Science,3,,Physical Sciences
5,4288104205,,A la rencontre du terahertz,T13807,Digital Communication and Information Studies,1705,,Computer Networks and Communications,17,,Computer Science,3,,Physical Sciences
6,4300432811,,Louis Darquier avant Darquier de Pellepoix. Les enseignements biographiques d'un manuscrit inédit de 1931,T13641,History of Science and Technology,1207,,History and Philosophy of Science,12,,Arts and Humanities,2,,Social Sciences
7,4366974590,,Un regard sur le travail,T11475,Territorial Governance and Environmental Participation,3312,,Sociology and Political Science,33,,Social Sciences,2,,Social Sciences
8,4387659385,,Why is agriculture heterogenous? A neoclassical view,T11743,Critique of Political Economy and Capitalist Development,3312,,Sociology and Political Science,33,,Social Sciences,2,,Social Sciences
9,1506636268,10.4237/sbqp.09.024,Avaliação das Habitações sob enfoque da cultura Guarani,T11858,Urban Geography and Social Development in Brazil,3322,,Urban Studies,33,,Social Sciences,2,,Social Sciences


In [25]:
# Apply the project's default pandas display options.
utils.pd_set_options()

# Prototype of a "base" model over a 5000-row slice of the raw OpenAlex works
# table: each *_id column falls back from the short id to the long id via
# COALESCE. The final SELECT keeps only rows where the field short id is
# missing although the field has a display name, i.e. the rows for which the
# fallback is actually exercised.
wr.athena.read_sql_query(
    sql="""
WITH
raw_openalex_works_reduced AS 
(
    SELECT * FROM "01_raw"."openalex_works_reduced" LIMIT 5000
),
base_openalex_works_reduced AS 
(
    SELECT
        id_openalex_short AS id_openalex,
        id_doi_short AS id_doi,
        title,
        COALESCE(primary_topic_short_id, primary_topic_long_id) AS primary_topic_id,
        primary_topic_display_name,
        COALESCE(primary_topic_subfield_short_id, primary_topic_subfield_long_id) AS primary_topic_subfield_id,
        primary_topic_subfield_display_name,
        primary_topic_field_short_id,
        COALESCE(primary_topic_field_short_id, primary_topic_field_long_id) AS primary_topic_field_id,
        primary_topic_field_display_name,
        primary_topic_domain_short_id,
        COALESCE(primary_topic_domain_short_id, primary_topic_domain_long_id) AS primary_topic_domain_id,
        primary_topic_domain_display_name
    FROM
        raw_openalex_works_reduced
)
SELECT * FROM base_openalex_works_reduced WHERE primary_topic_field_short_id IS NULL AND primary_topic_field_display_name IS NOT NULL
""",
    database='01_raw',
)

Unnamed: 0,id_openalex,id_doi,title,primary_topic_id,primary_topic_display_name,primary_topic_subfield_id,primary_topic_subfield_display_name,primary_topic_field_short_id,primary_topic_field_id,primary_topic_field_display_name,primary_topic_domain_short_id,primary_topic_domain_id,primary_topic_domain_display_name
0,2621689297,,Detection of abnormal aircraft control surface positions using a robust parametric test,T10876,Process Fault Detection and Diagnosis in Industries,2207,Control and Systems Engineering,,22,Engineering,,3,Physical Sciences
1,1650023746,10.4237/sbqp.09.102,Possibilidades de aplicação de ferramentas de análise da ventilação natural durante a concepção de projetos arquitetônicos,T10121,Building Energy Efficiency and Thermal Comfort Optimization,2215,Building and Construction,,22,Engineering,,3,Physical Sciences
2,2635976061,,Prediction of population performance based on reliability,T10968,Skew Distributions and Applications in Statistics,2613,Statistics and Probability,,26,Mathematics,,3,Physical Sciences
3,2730528915,,Santé et travail bien fait : que peut-on apprendre d'autres métiers ?,T14186,Healthcare Policy Reforms and Inequalities in France,3600,General Health Professions,,36,Health Professions,,4,Health Sciences
4,3178722630,,Persian Sentence-level Sentiment Polarity Classification,T10664,Sentiment Analysis and Opinion Mining,1702,Artificial Intelligence,,17,Computer Science,,3,Physical Sciences
5,4288090456,,On the high temperature crack propagation in the nickel-based superalloy AD730TM,T13129,Materials Science and Technology,2500,General Materials Science,,25,Materials Science,,3,Physical Sciences
6,4288104205,,A la rencontre du terahertz,T13807,Digital Communication and Information Studies,1705,Computer Networks and Communications,,17,Computer Science,,3,Physical Sciences
7,4300432811,,Louis Darquier avant Darquier de Pellepoix. Les enseignements biographiques d'un manuscrit inédit de 1931,T13641,History of Science and Technology,1207,History and Philosophy of Science,,12,Arts and Humanities,,2,Social Sciences
8,4366974590,,Un regard sur le travail,T11475,Territorial Governance and Environmental Participation,3312,Sociology and Political Science,,33,Social Sciences,,2,Social Sciences
9,4387659385,,Why is agriculture heterogenous? A neoclassical view,T11743,Critique of Political Economy and Capitalist Development,3312,Sociology and Political Science,,33,Social Sciences,,2,Social Sciences


In [10]:
# Apply the project's default pandas display options.
utils.pd_set_options()

# Count works per distinct (subfield, field, domain) combination, keeping the
# short and long id of every level side by side so that rows with a missing
# short id are easy to spot. Results are ordered domain -> field -> subfield.
#
# The query consists of the single `grouped` CTE only: an empty, unreferenced
# `base_openalex_works_reduced AS ()` stub that previously preceded it (with
# no SELECT inside and no separating comma) made the statement invalid SQL
# and has been removed.
wr.athena.read_sql_query("""
WITH
grouped AS (
    SELECT
        --primary_topic_short_id,
        --primary_topic_display_name,
        primary_topic_subfield_short_id,
        primary_topic_subfield_long_id,
        primary_topic_subfield_display_name,
        primary_topic_field_short_id,
        primary_topic_field_long_id,
        primary_topic_field_display_name,
        primary_topic_domain_short_id,
        primary_topic_domain_long_id,
        primary_topic_domain_display_name,
        COUNT(*) AS c
     FROM
        openalex_works_reduced
     GROUP BY
        primary_topic_subfield_short_id,
        primary_topic_subfield_long_id,
        primary_topic_subfield_display_name,
        primary_topic_field_short_id,
        primary_topic_field_long_id,
        primary_topic_field_display_name,
        primary_topic_domain_short_id,
        primary_topic_domain_long_id,
        primary_topic_domain_display_name
)
SELECT 
    *
FROM
    grouped
ORDER BY
    primary_topic_domain_long_id,
    primary_topic_domain_short_id,
    primary_topic_field_long_id,
    primary_topic_field_short_id,
    primary_topic_subfield_long_id,
    primary_topic_subfield_short_id
 """, '01_raw')

Unnamed: 0,primary_topic_subfield_short_id,primary_topic_subfield_long_id,primary_topic_subfield_display_name,primary_topic_field_short_id,primary_topic_field_long_id,primary_topic_field_display_name,primary_topic_domain_short_id,primary_topic_domain_long_id,primary_topic_domain_display_name,c
0,,1102,Agronomy and Crop Science,,11,Agricultural and Biological Sciences,,1,Life Sciences,1
1,,1103,Animal Science and Zoology,,11,Agricultural and Biological Sciences,,1,Life Sciences,2
2,,1105,"Ecology, Evolution, Behavior and Systematics",,11,Agricultural and Biological Sciences,,1,Life Sciences,2
3,,1109,Insect Science,,11,Agricultural and Biological Sciences,,1,Life Sciences,1
4,,1304,Biophysics,,13,"Biochemistry, Genetics and Molecular Biology",,1,Life Sciences,1
5,,1310,Endocrinology,,13,"Biochemistry, Genetics and Molecular Biology",,1,Life Sciences,1
6,,1311,Genetics,,13,"Biochemistry, Genetics and Molecular Biology",,1,Life Sciences,2
7,,1312,Molecular Biology,,13,"Biochemistry, Genetics and Molecular Biology",,1,Life Sciences,3
8,,2403,Immunology,,24,Immunology and Microbiology,,1,Life Sciences,2
9,,1207,History and Philosophy of Science,,12,Arts and Humanities,,2,Social Sciences,1
