In [14]:
import json
import os

import boto3
import requests
from botocore.exceptions import ClientError

In [15]:
# --- AWS settings ---
AWS_REGION = 'eu-west-2'  # London
TARGET_BUCKET = 'steve-sagemaker-data-bucket'  # bucket name used as the upload target

# --- Semantic Scholar Datasets API ---
API_BASE_URL = 'https://api.semanticscholar.org/datasets/v1/release'

# --- Secrets Manager lookup ---
SECRET_NAME = 'semanticscholar_api_key'  # secret ID handed to get_secret()
SECRET_KEY = 'x-api-key'  # field read from the decoded secret

In [18]:
def get_secret(region_name, secret_name):
    """Fetch a secret from AWS Secrets Manager and decode it from JSON.

    Adapted from the sample code shown in the Secrets Manager console:
    https://eu-west-2.console.aws.amazon.com/secretsmanager/secret?name=semanticscholar_api_key#secret-details-sample-code-section

    Args:
        region_name (str): Region the Secrets Manager client is created for.
        secret_name (str): Passed to the service as ``SecretId``.

    Returns:
        The object produced by ``json.loads`` on the response's ``SecretString``.

    Raises:
        botocore.exceptions.ClientError: Propagated unchanged from
            ``get_secret_value``. For the list of possible errors see
            https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html
    """
    secrets_client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    # No local handler: a ClientError raised by the call reaches the caller as-is.
    response = secrets_client.get_secret_value(SecretId=secret_name)
    return json.loads(response['SecretString'])

# Read the SECRET_KEY field of the decoded secret; later cells send this value as the
# 'x-api-key' request header. Kept as a plain assignment so the cell has no result to display.
S2_API_KEY = get_secret(AWS_REGION, SECRET_NAME)[SECRET_KEY]

In [16]:
def get_release_ids(timeout=30):
    """Fetch the list of dataset release IDs.

    Args:
        timeout (float or tuple): Seconds ``requests`` waits for the server
            (a single number, or a ``(connect, read)`` pair). Pass ``None``
            to wait without limit.

    Returns:
        The decoded JSON body of ``GET API_BASE_URL``; it is also printed.

    Raises:
        requests.HTTPError: If the response status is an error
            (via ``raise_for_status``).
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # requests has no default timeout; without one a stalled connection would
    # block the cell forever.
    response = requests.get(API_BASE_URL, timeout=timeout)
    response.raise_for_status()
    release_ids = response.json()
    print(release_ids)
    return release_ids


def get_latest_metadata(release_id, dataset_names=('papers', 's2orc'), timeout=30):
    """Fetch a release's metadata and keep only the selected datasets.

    Despite the name, this works for whichever ``release_id`` is passed in;
    the caller decides which release counts as the latest.

    Args:
        release_id (str): Release identifier appended to ``API_BASE_URL``.
        dataset_names (collection of str): Dataset names to keep. Defaults to
            ``('papers', 's2orc')``.
        timeout (float or tuple): Seconds ``requests`` waits for the server;
            ``None`` waits without limit.

    Returns:
        list: The entries of the response's ``'datasets'`` list whose
        ``'name'`` is in ``dataset_names``, in response order. Each one is
        also shown with ``display``.

    Raises:
        requests.HTTPError: If the response status is an error.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url = f'{API_BASE_URL}/{release_id}'
    # requests has no default timeout; set one so a stalled connection cannot hang the cell.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    selected = [
        dataset
        for dataset in response.json()['datasets']
        if dataset['name'] in dataset_names
    ]
    for dataset in selected:
        display(dataset)
    return selected
    

def get_releases_and_metadata():
    """Look up the last listed release, show its metadata, and return its ID.

    Returns:
        The final element of the value returned by ``get_release_ids()``.
    """
    newest_release_id = get_release_ids()[-1]
    # Called for what it displays; its return value is not used here.
    get_latest_metadata(newest_release_id)
    return newest_release_id


# Runs when the cell executes: prints the release list, displays the selected dataset
# metadata, and keeps the last release ID for the cells below.
S2_LATEST_RELEASE_ID = get_releases_and_metadata()

['2022-05-10', '2022-05-17', '2022-05-24', '2022-05-31', '2022-06-07', '2022-06-14', '2022-06-21', '2022-06-28', '2022-07-05', '2022-07-19', '2022-07-28', '2022-08-02', '2022-08-09', '2022-08-16', '2022-08-23', '2022-08-30', '2022-09-06', '2022-09-13', '2022-09-28', '2022-10-05', '2022-10-28', '2022-11-02', '2022-11-11', '2022-11-15', '2022-11-22', '2022-12-02', '2022-12-06', '2022-12-13', '2022-12-20', '2022-12-27', '2023-01-03', '2023-01-10', '2023-01-17', '2023-01-24', '2023-01-31', '2023-02-07', '2023-02-14', '2023-02-21', '2023-02-28', '2023-03-07', '2023-03-14', '2023-03-21', '2023-03-28', '2023-04-06', '2023-04-11', '2023-04-18', '2023-05-09', '2023-05-16', '2023-05-23', '2023-05-30', '2023-06-06', '2023-06-13', '2023-06-20', '2023-07-04', '2023-07-11', '2023-07-25', '2023-08-01', '2023-08-08', '2023-08-15', '2023-08-29', '2023-09-05', '2023-09-12', '2023-09-19', '2023-09-26', '2023-10-10', '2023-10-19', '2023-10-24', '2023-10-31', '2023-11-07', '2023-11-14', '2023-11-21', '2023

{'name': 'papers',
 'description': 'The core attributes of a paper (title, authors, date, etc.).\n200M records in 30 1.5GB files.',
 'README': 'Semantic Scholar Academic Graph Datasets\n\nThe "papers" dataset provides core metadata about papers.\n\nSCHEMA\nSee https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data\n\nThis dataset does not contain information about a paper\'s references or citations.\nInstead, join with citingPaperId/citedPaperId from the "citations" dataset.\n\nLICENSE\nThis collection is licensed under ODC-BY. (https://opendatacommons.org/licenses/by/1.0/)\n\nBy downloading this data you acknowledge that you have read and agreed to all the terms in this license.\n\nATTRIBUTION\nWhen using this data in a product or service, or including data in a redistribution, please cite the following paper:\n\nBibTex format:\n@misc{https://doi.org/10.48550/arxiv.2301.10140,\n  title = {The Semantic Scholar Open Data Platform},\n  author = {Kinney, Rodney and Anastasiades, Ch

{'name': 's2orc',
 'description': 'Full-body paper text parsed from open-access PDFs. Identifies structural elements such as paragraphs, sections, and bibliography entries.\n10M records in 30 4GB files.',
 'README': 'Semantic Scholar Academic Graph Datasets\n\nThe "s2orc" dataset contains parsed full-body text from selected papers.\n\nA subset of this data was previously released (in a different format) as S2ORC https://github.com/allenai/s2orc\n\nThe body text is parsed from PDF documents using Grobid, documented at https://grobid.readthedocs.io.\nIts output is converted from XML into a single string with a set of annotation spans.\n\nSCHEMA\n - externalIds: IDs of this paper in different catalogs\n - content:\n   - source:\n\t   - pdfUrls: URLs to the PDF\n\t   - oaInfo: license/url/status information from Unpaywall\n   - text: Full body text as a single string\n   - annotations: Annotated spans of the full body text\n\n\nLICENSE\nThis collection is licensed under ODC-BY. (https://op

In [None]:
def get_dataset_download_urls(release_id, dataset_name, timeout=30):
    """Fetch the download file list for one dataset of a release.

    The request is authenticated with ``S2_API_KEY`` in the ``x-api-key``
    header. The full response body is pretty-printed before returning.

    NOTE(review): the printed body includes the ``'files'`` entries; if those
    are signed URLs they end up in the saved notebook output. Confirm that is
    acceptable before sharing the notebook.

    Args:
        release_id (str): Release identifier used in the request path.
        dataset_name (str): Dataset name used in the request path.
        timeout (float or tuple): Seconds ``requests`` waits for the server;
            ``None`` waits without limit.

    Returns:
        The value stored under ``'files'`` in the decoded response.

    Raises:
        requests.HTTPError: If the response status is an error.
        requests.Timeout: If the server does not answer within ``timeout``.
        KeyError: If the response has no ``'files'`` entry.
    """
    url = f'{API_BASE_URL}/{release_id}/dataset/{dataset_name}'
    # requests has no default timeout; set one so a stalled connection cannot hang the cell.
    response = requests.get(url, headers={'x-api-key': S2_API_KEY}, timeout=timeout)
    response.raise_for_status()
    # Decode the body once and reuse it for both the printout and the return value.
    dataset_info = response.json()
    print(json.dumps(dataset_info, indent=2, default=str))
    return dataset_info['files']


# Request the 'papers' file list for the release resolved above. As the last expression
# in the cell, the returned value is also rendered as the cell result.
get_dataset_download_urls(S2_LATEST_RELEASE_ID, 'papers')

{
  "name": "papers",
  "description": "The core attributes of a paper (title, authors, date, etc.).\n200M records in 30 1.5GB files.",
  "README": "Semantic Scholar Academic Graph Datasets\n\nThe \"papers\" dataset provides core metadata about papers.\n\nSCHEMA\nSee https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data\n\nThis dataset does not contain information about a paper's references or citations.\nInstead, join with citingPaperId/citedPaperId from the \"citations\" dataset.\n\nLICENSE\nThis collection is licensed under ODC-BY. (https://opendatacommons.org/licenses/by/1.0/)\n\nBy downloading this data you acknowledge that you have read and agreed to all the terms in this license.\n\nATTRIBUTION\nWhen using this data in a product or service, or including data in a redistribution, please cite the following paper:\n\nBibTex format:\n@misc{https://doi.org/10.48550/arxiv.2301.10140,\n  title = {The Semantic Scholar Open Data Platform},\n  author = {Kinney, Rodney and Anastasi

KeyError: 'downloadUrl'

In [None]:
def download_and_upload_to_s3(url, dataset_id, target_bucket, aws_region="us-west-2", force_overwrite=False,
                              timeout=(10, 300)):
    """
    Downloads a file from a signed S3 URL and uploads it to another S3 bucket.

    The file is staged in the current working directory under its own name and
    removed again once the function is done with it, including when the
    download or the upload fails. Download and upload errors are printed, not
    raised.

    Args:
        url (str): The signed S3 URL of the file to download.
        dataset_id (str): The dataset ID to use as the prefix for the target key.
        target_bucket (str): The name of the target S3 bucket.
        aws_region (str): The AWS region of the target S3 bucket.
        force_overwrite (bool): If False, skip downloading and uploading if the file already exists in S3.
        timeout (float or tuple): Passed to ``requests.get``; a single number or a
            ``(connect, read)`` pair in seconds. ``None`` waits without limit.

    Returns:
        None
    """
    # Drop the query string BEFORE taking the last path segment: signed URLs can
    # carry '/' inside their query parameters, which would otherwise be mistaken
    # for a path separator and corrupt the filename.
    filename = url.split('?', 1)[0].rsplit('/', 1)[-1]
    target_key = f"{dataset_id}/{filename}"

    # Step 1: Check if the file already exists in S3
    s3_client = boto3.client("s3", region_name=aws_region)
    if not force_overwrite:
        try:
            s3_client.head_object(Bucket=target_bucket, Key=target_key)
        except ClientError as e:
            # "404" means the object is absent, so fall through and transfer it;
            # any other error code is reported and the file is skipped.
            if e.response['Error']['Code'] != "404":
                print(f"Error checking file in S3: {e}")
                return
        else:
            print(f"File already exists in S3: s3://{target_bucket}/{target_key}. Skipping download and upload.")
            return

    local_file = filename  # Use the filename as the local file name
    # Only delete the staging file if this call actually opened it for writing.
    local_file_created = False
    try:
        # Step 2: Download the file locally
        try:
            print(f"Downloading file from {url}...")
            # The context manager releases the streamed connection even when the
            # transfer is interrupted.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                with open(local_file, "wb") as file:
                    local_file_created = True
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
            print(f"File downloaded: {local_file}")
        except Exception as e:
            print(f"Error downloading file: {e}")
            return

        # Step 3: Upload the file to the target S3 bucket
        try:
            print(f"Uploading {local_file} to S3 bucket {target_bucket}...")
            s3_client.upload_file(local_file, target_bucket, target_key)
            print(f"File uploaded to S3: s3://{target_bucket}/{target_key}")
        except Exception as e:
            print(f"Error uploading file to S3: {e}")
    finally:
        # Clean up local file (also covers a partially written download)
        if local_file_created and os.path.exists(local_file):
            os.remove(local_file)

def process_files(file_urls, dataset_id, target_bucket, aws_region="us-east-1", force_overwrite=False):
    """
    Transfers every URL in ``file_urls`` to the target bucket, one at a time.

    Each URL is handed to ``download_and_upload_to_s3`` together with the
    remaining arguments, in the order the URLs are given.

    Args:
        file_urls (list): List of signed S3 URLs.
        dataset_id (str): The dataset ID to use as the prefix for the target key.
        target_bucket (str): The name of the target S3 bucket.
        aws_region (str): The AWS region of the target S3 bucket (defaults to "us-east-1").
        force_overwrite (bool): If False, files that already exist in S3 are skipped.

    Returns:
        None
    """
    for signed_url in file_urls:
        download_and_upload_to_s3(
            signed_url,
            dataset_id,
            target_bucket,
            aws_region,
            force_overwrite,
        )

if __name__ == "__main__":
    # Example usage. The URLs below are placeholders, not real signed URLs.
    file_urls = [
        "https://example.com/path/to/file1.gz?AWSAccessKeyId=ACCESS_KEY&Signature=SIGNATURE&x-amz-security-token=TOKEN",
        "https://example.com/path/to/file2.gz?AWSAccessKeyId=ACCESS_KEY&Signature=SIGNATURE&x-amz-security-token=TOKEN",
    ]
    dataset_id = "papers"
    # Reuse the notebook-level settings instead of repeating their literal values,
    # so the bucket and region are configured in one place only.
    target_bucket = TARGET_BUCKET
    process_files(file_urls, dataset_id, target_bucket, aws_region=AWS_REGION, force_overwrite=False)

'<REDACTED>'