In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import subprocess
from IPython.display import display
# 3
# Make the shared helper modules importable. Both candidate locations are
# derived from the current working directory: ../01_modules and ./01_modules
# (variable names suggest notebook vs. processing-script runs respectively).
modules_path_in_notebook = os.path.abspath(os.path.join(os.getcwd(), '..', '01_modules'))
modules_path_in_processing_script = os.path.abspath(os.path.join(os.getcwd(), '01_modules'))
# The membership checks keep this cell idempotent: re-running it must not
# append the same directory to sys.path over and over.
if os.path.exists(modules_path_in_notebook) and modules_path_in_notebook not in sys.path:
    sys.path.append(modules_path_in_notebook)
if os.path.exists(modules_path_in_processing_script) and modules_path_in_processing_script not in sys.path:
    sys.path.append(modules_path_in_processing_script)

# Diagnostic output: show the working directory and its contents so that it is
# visible which of the two module locations applies to this run.
result = subprocess.run(["pwd"], capture_output=True, text=True)
print(result.stdout)
result = subprocess.run(["ls", "-l"], capture_output=True, text=True)
print(result.stdout)
# Jupyter only reads a local module the first time after kernel start.
# Re-running a cell that imports it would not change anything, even if the
# imported module has since changed. As a workaround, import the module and
# then force a refresh with importlib.reload. These imports must stay below
# the sys.path setup above, otherwise the modules cannot be found.
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config)

utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1
config.py loaded: v0.1


In [2]:
# Look up the Semantic Scholar secret via the project helper (presumably backed
# by AWS Secrets Manager, given the region argument -- confirm in utils) and
# pick the API key out of the returned mapping.
semanticscholar_secret = utils.get_secret(
    config.AWS_REGION,
    config.SEMANTICSCHOLAR_SECRET_NAME,
)
SEMANTICSCHOLAR_API_KEY = semanticscholar_secret[config.SEMANTICSCHOLAR_SECRET_KEY]

# Only the key names are printed here, never the secret values themselves.
print(
    f'Keys stored in Secrets Managers for the secret "{config.SEMANTICSCHOLAR_SECRET_NAME}":',
    list(semanticscholar_secret.keys()),
)

Keys stored in Secrets Managers for the secret "semanticscholar_api_key": ['x-api-key']


In [3]:
def semanticscholar_get_release_ids(timeout=30):
    """Fetch the list of dataset release IDs.

    Sends a GET request to ``config.SEMANTICSCHOLAR_API_BASE_URL`` and
    returns the decoded JSON body, which is also printed.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection,
            which would hang the whole notebook or processing job.

    Returns:
        The parsed JSON response (the list of release IDs).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(config.SEMANTICSCHOLAR_API_BASE_URL, timeout=timeout)
    response.raise_for_status()
    res = response.json()
    print(res)
    return res


def semanticscholar_get_latest_metadata(release_id, timeout=30):
    """Fetch the metadata of the 'papers' and 's2orc' datasets of a release.

    Despite the function name, the release is not determined here: the
    metadata of whichever ``release_id`` is passed in gets requested.

    Args:
        release_id: ID of the release; appended to
            ``config.SEMANTICSCHOLAR_API_BASE_URL`` to build the request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection.

    Returns:
        A list with the entries of the response's ``'datasets'`` list whose
        ``'name'`` is ``'papers'`` or ``'s2orc'``, in response order. Each
        selected entry is also rendered with ``display``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = f'{config.SEMANTICSCHOLAR_API_BASE_URL}/{release_id}'
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    res_json = response.json()

    wanted_names = ('papers', 's2orc')
    res = [dataset for dataset in res_json['datasets'] if dataset['name'] in wanted_names]
    for dataset in res:
        display(dataset)
    return res


def get_releases_and_metadata():
    """Determine the latest release ID and display that release's metadata.

    The last element of the release list is taken as the latest release.
    NOTE(review): this relies on the API returning the IDs in ascending
    order -- confirm against the API documentation.

    Returns:
        The ID of the latest release. The metadata itself is only displayed
        (by ``semanticscholar_get_latest_metadata``), not returned.

    Raises:
        IndexError: If the API returned no release IDs at all.
    """
    release_ids = semanticscholar_get_release_ids()
    if not release_ids:
        # Same exception type as the bare ``release_ids[-1]`` would raise,
        # but with a message that points at the actual cause.
        raise IndexError('The Semantic Scholar API returned no release IDs.')
    latest_release_id = release_ids[-1]

    semanticscholar_get_latest_metadata(latest_release_id)
    return latest_release_id


# Resolve the latest release ID once and keep it at module level; a later cell
# writes it into the results file.
SEMANTICSCHOLAR_LATEST_RELEASE_ID = get_releases_and_metadata()
# Bare expression as the last line so the notebook echoes the value as the
# cell's output.
SEMANTICSCHOLAR_LATEST_RELEASE_ID

['2022-05-10', '2022-05-17', '2022-05-24', '2022-05-31', '2022-06-07', '2022-06-14', '2022-06-21', '2022-06-28', '2022-07-05', '2022-07-19', '2022-07-28', '2022-08-02', '2022-08-09', '2022-08-16', '2022-08-23', '2022-08-30', '2022-09-06', '2022-09-13', '2022-09-28', '2022-10-05', '2022-10-28', '2022-11-02', '2022-11-11', '2022-11-15', '2022-11-22', '2022-12-02', '2022-12-06', '2022-12-13', '2022-12-20', '2022-12-27', '2023-01-03', '2023-01-10', '2023-01-17', '2023-01-24', '2023-01-31', '2023-02-07', '2023-02-14', '2023-02-21', '2023-02-28', '2023-03-07', '2023-03-14', '2023-03-21', '2023-03-28', '2023-04-06', '2023-04-11', '2023-04-18', '2023-05-09', '2023-05-16', '2023-05-23', '2023-05-30', '2023-06-06', '2023-06-13', '2023-06-20', '2023-07-04', '2023-07-11', '2023-07-25', '2023-08-01', '2023-08-08', '2023-08-15', '2023-08-29', '2023-09-05', '2023-09-12', '2023-09-19', '2023-09-26', '2023-10-10', '2023-10-19', '2023-10-24', '2023-10-31', '2023-11-07', '2023-11-14', '2023-11-21', '2023

{'name': 'papers',
 'description': 'The core attributes of a paper (title, authors, date, etc.).\n200M records in 30 1.5GB files.',
 'README': 'Semantic Scholar Academic Graph Datasets\n\nThe "papers" dataset provides core metadata about papers.\n\nSCHEMA\nSee https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data\n\nThis dataset does not contain information about a paper\'s references or citations.\nInstead, join with citingPaperId/citedPaperId from the "citations" dataset.\n\nLICENSE\nThis collection is licensed under ODC-BY. (https://opendatacommons.org/licenses/by/1.0/)\n\nBy downloading this data you acknowledge that you have read and agreed to all the terms in this license.\n\nATTRIBUTION\nWhen using this data in a product or service, or including data in a redistribution, please cite the following paper:\n\nBibTex format:\n@misc{https://doi.org/10.48550/arxiv.2301.10140,\n  title = {The Semantic Scholar Open Data Platform},\n  author = {Kinney, Rodney and Anastasiades, Ch

{'name': 's2orc',
 'description': 'Full-body paper text parsed from open-access PDFs. Identifies structural elements such as paragraphs, sections, and bibliography entries.\n10M records in 30 4GB files.',
 'README': 'Semantic Scholar Academic Graph Datasets\n\nThe "s2orc" dataset contains parsed full-body text from selected papers.\n\nA subset of this data was previously released (in a different format) as S2ORC https://github.com/allenai/s2orc\n\nThe body text is parsed from PDF documents using Grobid, documented at https://grobid.readthedocs.io.\nIts output is converted from XML into a single string with a set of annotation spans.\n\nSCHEMA\n - externalIds: IDs of this paper in different catalogs\n - content:\n   - source:\n\t   - pdfUrls: URLs to the PDF\n\t   - oaInfo: license/url/status information from Unpaywall\n   - text: Full body text as a single string\n   - annotations: Annotated spans of the full body text\n\n\nLICENSE\nThis collection is licensed under ODC-BY. (https://op

'2025-08-12'

In [8]:
# Command-line arguments. parse_known_args() is used instead of parse_args()
# so that unrecognised arguments are collected separately rather than
# aborting the run.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--runtype",
    type=str,
    default='dev',
)
parser.add_argument(
    "--test-argument-key-01",
    type=str,
    default='test-argument-default-value-01',
)
parser.add_argument(
    "--test-argument-key-02",
    type=str,
    default='test-argument-default-value-02',
)
args, _ = parser.parse_known_args()
RUNTYPE = args.runtype

print(args)

# Reject unknown run types up front, then choose the path prefix for the run.
if RUNTYPE not in ('dev', 'prod'):
    raise ValueError('Argument --runtype should be either "dev" or "prod" (without quotes).')
PROCESSING_FILEPATH_PREFIX = (
    '_dev_processing' if RUNTYPE == 'dev' else config.DEFAULT_PROCESSING_FILEPATH_PREFIX
)

PROCESSING_FILEPATH_INPUT = f'{PROCESSING_FILEPATH_PREFIX}/input/data/'
PROCESSING_FILEPATH_OUTPUT = f'{PROCESSING_FILEPATH_PREFIX}/output/results/'

utils.ensure_path(PROCESSING_FILEPATH_INPUT)
utils.ensure_path(PROCESSING_FILEPATH_OUTPUT)

# Read the test input file; it has to exist already in the input directory.
with open(os.path.join(PROCESSING_FILEPATH_INPUT, 'test.txt'), "r", encoding='utf-8') as test_file:
    test_content = test_file.read()

# Write one "name: value" line per collected value. The lines are separated by
# newlines and there is no trailing newline after the last one.
with open(os.path.join(PROCESSING_FILEPATH_OUTPUT, 'results.txt'), "w", encoding='utf-8') as results_file:
    print(
        f'semanticscholar_secret_keys[0]: {list(semanticscholar_secret.keys())[0]}',
        f'SEMANTICSCHOLAR_LATEST_RELEASE_ID: {SEMANTICSCHOLAR_LATEST_RELEASE_ID}',
        f'test_content: {test_content}',
        f'args.test_argument_key_01: {args.test_argument_key_01}',
        f'args.test_argument_key_02: {args.test_argument_key_02}',
        sep='\n',
        end='',
        file=results_file,
    )

Namespace(runtype='dev', test_argument_key_01='test-argument-default-value-01', test_argument_key_02='test-argument-default-value-02')
