In [None]:
import requests
import json
import os
from dotenv import load_dotenv
import csv
import re
from time import sleep

# Load variables from a local .env file (if present) into the process environment;
# the API_KEY used for the authenticated requests is read from os.environ further down.
load_dotenv()

True

We can list the datasets offered by the API with the following code. We will be using the oldest available release of the datasets.

In [3]:
base_url = "https://api.semanticscholar.org/datasets/v1/release/"

# Work with the first (oldest) release of the datasets
release_id = "2022-05-10"

# Ask the API which datasets this release contains
response = requests.get(base_url + release_id, timeout=30)
# Fail loudly on an HTTP error instead of hitting a confusing KeyError below
response.raise_for_status()

# Parse the body once and reuse it for both the printout and the name list
release_info = response.json()
print(json.dumps(release_info, indent=2))

res = release_info["datasets"]
# Names of the datasets in this release; later cells iterate over this list
dbs = [dataset["name"] for dataset in res]

{
  "release_id": "2022-05-10",
  "README": "Semantic Scholar Academic Graph Datasets\n\nThese datasets provide a variety of information about research papers taken from a snapshot in time of the Semantic Scholar corpus.\n\nThis site is provided by The Allen Institute for Artificial Intelligence (\u201cAI2\u201d) as a service to the\nresearch community. The site is covered by AI2 Terms of Use and Privacy Policy. AI2 does not claim\nownership of any materials on this site unless specifically identified. AI2 does not exercise editorial\ncontrol over the contents of this site. AI2 respects the intellectual property rights of others. If\nyou believe your copyright or trademark is being infringed by something on this site, please follow\nthe \"DMCA Notice\" process set out in the Terms of Use (https://allenai.org/terms).\n\nSAMPLE DATA ACCESS\nSample data files can be downloaded with the following UNIX command:\n\nfor f in $(curl https://s3-us-west-2.amazonaws.com/ai2-s2ag/samples/MANIFEST.

In [5]:
# Show the dataset names collected from the release listing
print(dbs)


['abstracts', 'authors', 'citations', 'embeddings', 'papers', 's2orc', 'tldrs']


Each dataset is very large, so it is split into several downloadable parts. Let's list the first download link of each dataset; click on a link to download that part.

In [8]:
# This endpoint requires authentication via api key
api_key = os.environ.get('API_KEY')
if not api_key:
    # Without a key the request is sent unauthenticated and the failure only
    # surfaces later as a KeyError on "files"; stop early with a clear message.
    raise RuntimeError("API_KEY is not set; define it in the environment or in the .env file")
headers = {"x-api-key": api_key}

# NOTE: the printed links are pre-signed (they carry temporary credentials in
# the query string), so clear this cell's output before sharing the notebook.
for name in dbs:
    # Pause between calls to stay gentle on the API
    sleep(1)
    # Dataset whose file listing is requested in this iteration
    dataset_name = name
    # Send the GET request and make sure it succeeded before reading the body
    response = requests.get(base_url + release_id + '/dataset/' + dataset_name, headers=headers, timeout=30)
    response.raise_for_status()
    print(name+" download url:")
    # Only the first part of each dataset is listed
    url = response.json()["files"][0]
    print(url)

abstracts download url:
https://ai2-s2ag.s3.amazonaws.com/staging/2022-05-10/abstracts/20220513_070629_00025_mtwkq_0b601c4a-dca2-4eab-921b-1c86bcba0147.gz?AWSAccessKeyId=ASIA5BJLZJPW7Z77QZKE&Signature=qWAs82Go2FzDcl%2F%2FYHQ%2FPooCUZo%3D&x-amz-security-token=IQoJb3JpZ2luX2VjELT%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJHMEUCIQDmIUwh%2BrWCKDivZkQpX4g%2F4RpGGhC00WEo5WBU97rs5gIgN6rrh9112bHH%2FelAxkJK0%2BTrf8nYlqz3PeqgFqqYDpIqiAQI7f%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAAGgw4OTYxMjkzODc1MDEiDEHK6Tmm3OMhu0BfwyrcA%2F6eyjpf463eYd%2FAf%2F45o2HcRsG4dZ8H%2BcB8Y5TYMKTcmEdFxGioGROea3hzYuy%2FvoIDtQMcu2igyw5gRI6kZMtBLtyVIUYG109IFEg9L1Sn92CO7foBGhNdN2aQZj40O7TfKIK3rLGHW6K%2Fs2ZEoRa%2FfNP25YkADBY%2FGnE6E6pNMDQ7Cxyn0Zz7FWUNPTzZp%2BWCrQa0gBgQl3eAIIFMO6Vn55BHQ7noQMAhCBIyoe42PlcG%2F%2F5RqFDFIlg2Jrshu3V0paDhDuk3YFnXllNRvjpmyjtriqJh7IN8ldakqmgZOOSUGa2z3scLoh3yirDmE3JuatLGNrXyTFdeFAjju8djWx5o5An8%2FOTaGSKTkhkNuunFu3waaFwNRkrSFXOOO6xd%2F74laim70m4EXpnu0bKVfp1WxTvBT6rCnE4xaB5ZOrXjwfTy7xcjqc7nEhif1pYb5QVveROFdNz

Decompress each downloaded file, put the resulting JSON file in a folder called `rawdata`, and rename it after its dataset (for example, `rawdata/papers`).

Now we will start cleaning the data, beginning with the papers and keywords. First, let's define the conditions under which a record is discarded.

In [None]:
# Define the condition for deleting a row
def delete_conditions(json_obj):
    """Decide whether a paper record must be dropped from the export.

    A record is dropped when any of the following holds:
      * "publicationtypes" is missing or None;
      * "authors" is missing, None or empty;
      * "s2fieldsofstudy" is missing or None;
      * none of "Conference", "Review" or "JournalArticle" appears in
        "publicationtypes";
      * it is a "Conference" or "Review" without a "venue";
      * it is a "JournalArticle" without a "journal".

    Args:
        json_obj: dict parsed from one line of the papers JSONL file.

    Returns:
        True if the record should be discarded, False if it should be kept.
    """
    publication_types = json_obj.get("publicationtypes")
    if publication_types is None:
        return True
    # Covers [] as well as a missing/None author list, which would otherwise
    # crash the caller when it reads the first author.
    if not json_obj.get("authors"):
        return True
    if json_obj.get("s2fieldsofstudy") is None:
        return True

    is_conference = "Conference" in publication_types
    is_review = "Review" in publication_types
    is_journal = "JournalArticle" in publication_types

    # Only these three publication types are exported
    if not (is_conference or is_review or is_journal):
        return True
    # Conferences and reviews are described by their venue
    if (is_conference or is_review) and json_obj.get("venue") is None:
        return True
    # Journal articles are described by their journal
    if is_journal and json_obj.get("journal") is None:
        return True
    # Explicit False (rather than an implicit None) for records that are kept
    return False

In [None]:
input_file = "rawdata/papers"       # Raw papers file, one JSON object per line
output_file = "csv/papers.csv"      # One row per exported paper
keywords_file = "csv/keywords.csv"  # One row per (paper, field of study) pair
RECORDS = 15000  # Maximum number of papers to export
count = 0        # Papers exported so far; also used as the surrogate ID ("sid")

paper_fields = ["sid", "corpusid", "title", "authorId", "authorName", "url", "year",
                "referencecount", "citationcount", "influentialcitationcount",
                "publicationtype", "publicationdate", "venue", "publicationvenueid",
                "journalName", "journalVolume", "journalPages"]

# Create the output folder(s) up front so opening the CSV files for writing
# does not fail when the folder does not exist yet.
for path in (output_file, keywords_file):
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)

# Read the JSONL file, keep the records that pass the filter and write them as CSV
with open(input_file, "r", encoding="utf-8") as infile, \
     open(output_file, "w", newline='', encoding="utf-8") as outfile, \
     open(keywords_file, "w", newline='', encoding="utf-8") as keyfile:

    csv_writer_1 = csv.DictWriter(outfile, fieldnames=paper_fields)
    csv_writer_2 = csv.DictWriter(keyfile, fieldnames=["sid", "keyword"])

    # Write the headers to the CSV files
    csv_writer_1.writeheader()
    csv_writer_2.writeheader()

    for line in infile:
        if count >= RECORDS:
            break

        # Only the parsing step can raise JSONDecodeError, so only it is guarded
        try:
            json_line = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            continue

        # Skip the records that match a deletion condition
        if delete_conditions(json_line):
            continue

        count += 1
        # Only the first listed author is exported
        first_author = json_line["authors"][0]
        publication_types = json_line["publicationtypes"]

        venue = json_line.get("venue", '')
        venue_id = json_line.get("publicationvenueid", '')
        journal_name = journal_volume = journal_pages = None

        if "Conference" in publication_types:
            publication_type = "Conference"
        elif "Review" in publication_types:
            # Reviews are exported under the "Workshop" label
            publication_type = "Workshop"
        elif "JournalArticle" in publication_types:
            publication_type = "JournalArticle"
            # Journal articles carry journal details instead of venue details
            venue = None
            venue_id = None
            journal = json_line.get("journal") or {}
            journal_name = journal.get("name")
            journal_volume = journal.get("volume")
            pages = journal.get("pages")
            if pages is not None:
                # Strip every whitespace character from the page range
                journal_pages = re.sub(r'\s+', '', pages)
        else:
            # Not expected after filtering; keep the raw value untouched
            publication_type = publication_types

        # A missing title becomes an empty string instead of raising AttributeError
        title = (json_line.get("title") or "").strip().replace("\n", " ")

        row_papers = {
            "sid": count,  # Surrogate ID, shared with the keywords file
            "corpusid": json_line.get("corpusid"),
            "title": title,
            "authorId": first_author.get("authorId"),
            "authorName": first_author.get("name"),
            "url": json_line.get("url"),
            "year": json_line.get("year"),
            "referencecount": json_line.get("referencecount"),
            "citationcount": json_line.get("citationcount"),
            "influentialcitationcount": json_line.get("influentialcitationcount"),
            "publicationtype": publication_type,
            "publicationdate": json_line.get("publicationdate"),
            "venue": venue,
            "publicationvenueid": venue_id,
            "journalName": journal_name,
            "journalVolume": journal_volume,
            "journalPages": journal_pages,
        }
        # Write the row to CSV 1
        csv_writer_1.writerow(row_papers)

        # De-duplicate the categories while keeping first-seen order, so the
        # keyword rows come out in the same order on every run.
        keywords = list(dict.fromkeys(
            keyword["category"] for keyword in (json_line.get("s2fieldsofstudy") or [])
        ))
        for keyword in keywords:
            # Write one row to CSV 2 for each keyword, keyed by the surrogate ID
            csv_writer_2.writerow({"sid": count, "keyword": keyword})

print(f"Modified JSONL saved to {output_file}")

Modified JSONL saved to csv/papers.csv


In [20]:
# Fetch the abstract and references of several papers in one call to the
# Graph API batch endpoint. (A single paper can also be requested with a GET on
# https://api.semanticscholar.org/graph/v1/paper/corpusid:238221832.)
r = requests.post(
    'https://api.semanticscholar.org/graph/v1/paper/batch',
    params={'fields': 'abstract,references'},
    json={"ids": ["649def34f8be52c8b66281af98ae884c09aef38b", "ARXIV:2106.15928"]},
    timeout=30,
)
# Stop on an HTTP error instead of pretty-printing an error payload
r.raise_for_status()
print(json.dumps(r.json(), indent=2))

[
  {
    "paperId": "649def34f8be52c8b66281af98ae884c09aef38b",
    "abstract": "We describe a deployed scalable system for organizing published scientific literature into a heterogeneous graph to facilitate algorithmic manipulation and discovery. The resulting literature graph consists of more than 280M nodes, representing papers, authors, entities and various interactions between them (e.g., authorships, citations, entity mentions). We reduce literature graph construction into familiar NLP tasks (e.g., entity extraction and linking), point out research challenges due to differences from standard formulations of these tasks, and report empirical results for each task. The methods described in this paper are used to enable semantic features in www.semanticscholar.org.",
    "references": [
      {
        "paperId": "1fec9d41d372267b4474f18cbeadd806c8b67adb",
        "title": "Extracting Scientific Figures with Distantly Supervised Neural Networks"
      },
      {
        "paperId": 