In [36]:
import requests
import json
import os
from dotenv import load_dotenv
import csv
import re
import pandas as pd
import urllib.parse

We can visualize the databases of the API with the following code. We will be using the oldest version of the databases.

In [2]:
base_url = "https://api.semanticscholar.org/datasets/v1/release/"

# Set the release id we will work with the first release
release_id = "2022-05-10"

# Make a request to get datasets available the latest release
response = requests.get(base_url + release_id)

# Print the response data
print(json.dumps(response.json(), indent=2))
res=json.loads(response.text)["datasets"]
dbs = []
for dataset in res:
    dbs.append(dataset["name"])

{
  "release_id": "2022-05-10",
  "README": "Semantic Scholar Academic Graph Datasets\n\nThese datasets provide a variety of information about research papers taken from a snapshot in time of the Semantic Scholar corpus.\n\nThis site is provided by The Allen Institute for Artificial Intelligence (\u201cAI2\u201d) as a service to the\nresearch community. The site is covered by AI2 Terms of Use and Privacy Policy. AI2 does not claim\nownership of any materials on this site unless specifically identified. AI2 does not exercise editorial\ncontrol over the contents of this site. AI2 respects the intellectual property rights of others. If\nyou believe your copyright or trademark is being infringed by something on this site, please follow\nthe \"DMCA Notice\" process set out in the Terms of Use (https://allenai.org/terms).\n\nSAMPLE DATA ACCESS\nSample data files can be downloaded with the following UNIX command:\n\nfor f in $(curl https://s3-us-west-2.amazonaws.com/ai2-s2ag/samples/MANIFEST.

Each database is really big and because of that it is divided different downloadable parts. Let's list the first link of each part, click on it to download it.

In [3]:
load_dotenv()
# This endpoint requires authentication via api key
api_key = os.environ.get('API_KEY')
headers = {"x-api-key": api_key}

for name in dbs:
    # Define dataset name you want to download
    dataset_name = name
    # Send the GET request and store the response in a variable
    response = requests.get(base_url + release_id + '/dataset/' + dataset_name, headers=headers)
    print(name+" download url:")
    url=json.loads(response.text)["files"][0]
    print(url)

abstracts download url:
https://ai2-s2ag.s3.amazonaws.com/staging/2022-05-10/abstracts/20220513_070629_00025_mtwkq_0b601c4a-dca2-4eab-921b-1c86bcba0147.gz?AWSAccessKeyId=ASIA5BJLZJPWV4IQXRFZ&Signature=ub6RkmV1DIeT7YctnfZDpGKeMT0%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEMv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJHMEUCIH8fAKDF0J3RAmWi%2FFCeSg0zCX1ETGWnHOZ%2BgDTgeWyqAiEA0EvDToJJR2uICdIfo5gc7yScXg%2FyCPi%2Bz3UUeRqCWjwq%2FwMIFBAAGgw4OTYxMjkzODc1MDEiDA9dRDpcG%2FWDtnl1eircA4KFxd3%2BpoQ2jl1Im2eQPtyq20ixJx6kOw1e3lnwU1nSHG0vkIz3TTrSrFfAlW01B%2B82PKmPjARceLzCRIdH811%2FI2STtNiTopP4A02LpSiq1zW4M1OlbHtXq8QknsyKPkWehzPjQHoNSa%2FjPSGC2ymBnZqmC6mIQREln2iXnnoN9qUWvBQIJ03Gvs%2FBJrg1kYciuChnXQqxpVyHGjP%2FkXOBZvW%2FsPHP5q1gB2UOcPvYRMVe2ABF5Tn%2Fiw59Z231aEuH3MjcH5UvWGc4Hy9JF2eqpde7b49dOeHTDDdyFDch28OqMbVg%2BP7m6HYPt%2BZv8u9dgZA%2FflQWmqBGpbzLZD8HH%2FLskBNSvAy5SfxkpF%2BNdx5PvDBUP2ehJBdALxhHq%2BrdE35URlXU8XM8Mga1kogILTRJtqrqo6MozfcFMZvQoVet0WQ%2FlOtHyG2GpthLQ9L4NxMyLZblzj7DSnL8z%2FqGUjicdCMxN9MRyo1Jh8A

Put each json file in a folder called rawdata and change their names to the appropiate ones.

Now we will start cleaning the data, we will start with the papers and keywords. First let's define the conditions. 

In [73]:
# Define the condition for deleting a row
def conference_conditions(json_obj):
    if json_obj["authors"] == []:
        return True
    elif json_obj["venue"] == None:
        return True
    elif json_obj["s2FieldsOfStudy"] == None:
        return True

def journal_conditions(json_obj):
    if json_obj["authors"] == []:
        return True 
    elif json_obj["journal"] == None or "name" not in json_obj["journal"] or "pages" not in json_obj["journal"] or "volumen" not in json_obj["journal"]:
        return True
    elif json_obj["s2FieldsOfStudy"] == None:
        return True
    
def save_to_csv(json_obj, filename):

                  
                
                row_papers = {
                "sid": count, # Add a new column with a surrogated ID, just in case
                "corpusid": json_line.get("corpusid"),
                "title":  json_line.get("title").strip().replace("\n", " "),
                "authorId": json_line.get("authorId"),
                "authorName": json_line.get("authorName"),
                "url": json_line.get("url"),
                "year": json_line.get("year"),
                "referencecount": json_line.get("referencecount"),
                "citationcount": json_line.get("citationcount"),
                "influentialcitationcount": json_line.get("influentialcitationcount"),
                "publicationtype": json_line.get("publicationtypes"),
                "publicationdate": json_line.get("publicationdate"),
                "venue": json_line.get("venue", ''),
                "publicationvenueid": json_line.get("publicationvenueid", ''),
                "journalName": json_line.get("journalName"),
                "journalVolume": json_line.get("journalVolume"),
                "journalPages": json_line.get("journalPages")
                }
                # Write the row to CSV 1
                csv_writer_1.writerow(row_papers)

                keywords=list()
                for keyword in json_line.get("s2fieldsofstudy", []):
                    keywords.append(keyword["category"])
                keywords = list(set(keywords))
                for keyword in keywords:
                    row_keywords = {
                        "sid": count, # We will use the surrogated ID here, just in case
                        "keyword": keyword
                    }
                    # Write the row to CSV 2 for each keyword
                    csv_writer_2.writerow(row_keywords)

In [None]:
output_file = "csv/papers.csv"  
keywords_file="csv/keywords.csv"

api_key = os.environ.get('API_KEY')
headers = {"x-api-key": api_key}

#********************************************************************************************************************
RECORDS = 50  # Number of records to save per cathegory 
PUBLICATION_TYPES = ["Conference", "JournalArticle"]  # Publication types to filter (There is not Workshop)
QUERY = "data"  # Query to filter the papers
FIELDS = "paperId,corpusId,title,abstract,authors,url,year,referenceCount,citationCount,influentialCitationCount,s2FieldsOfStudy,publicationDate,journal,venue,publicationVenue"  # Fields to retrieve from the API
#********************************************************************************************************************

query_encoded = urllib.parse.quote(QUERY)
fields_encoded = urllib.parse.quote(FIELDS)

count = 0

with open(output_file, "w", newline='', encoding="utf-8") as outfile  :   
    csv_writer_1 = csv.DictWriter(outfile, fieldnames=["sid","paperId","corpusId", "title", "authorId", "authorName", "url", "year", "referenceCount", "citationCount", "influentialCitationCount","publicationType", "publicationDate"])

    # Write the headers to the CSV files
    csv_writer_1.writeheader()
    for publication_type in PUBLICATION_TYPES:
        type_encoded = urllib.parse.quote(publication_type)
        url1="https://api.semanticscholar.org/graph/v1/paper/search?query="+query_encoded+"&publicationTypes="+type_encoded+"&fields="+fields_encoded+"&limit="+str(RECORDS)
        response = requests.get(url1, headers=headers).json()
        for line in response["data"]:
            count += 1
            try:  
                if publication_type=="Conference":
                    if not conference_conditions(line):
                        line["publicationType"]="Conference"
                        line["journalName"]=None
                        line["journalVolume"]=None
                        line["journalPages"]=None
                elif publication_type=="JournalArticle":
                    if not journal_conditions(line):
                        line["publicationType"]="JournalArticle"
                        line["venue"]=None
                        line["journalName"]=line["journal"]["name"]
                        line["journalVolume"]=line["journal"]["volume"]
                        if line["journal"]["pages"] is not None:
                            line["journalPages"]=re.sub(r'\s+', '', line["journal"]["pages"])
                        else:
                            line["journalPages"]=None    
                line["authorId"] = line["authors"][0]["authorId"]
                line["authorName"] = line["authors"][0]["name"]
                row_papers = {
                    "sid": count, # Add a new column with a surrogated ID, just in case
                    "paperId": line.get("paperId"),
                    "corpusId": line.get("corpusId"),
                    "title":  line.get("title").strip().replace("\n", " "),
                    "authorId": line.get("authorId"),
                    "authorName": line.get("authorName"),
                    "url": line.get("url"),
                    "year": line.get("year"),
                    "referenceCount": line.get("referenceCount"),
                    "citationCount": line.get("citationCount"),
                    "influentialCitationCount": line.get("influentialCitationCount"),
                    "publicationType": line.get("publicationType"),
                    "publicationDate": line.get("publicationDate"),
                }
                csv_writer_1.writerow(row_papers)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
print(f"Modified JSONL saved to {output_file}")


Modified JSONL saved to csv/papers.csv


In [34]:
output_file="csv/citations.csv"
papers_file = "csv/papers.csv"  


df=pd.read_csv(papers_file)
ids=df["corpusid"].values.tolist()

api_key = os.environ.get('API_KEY')
headers = {"x-api-key": api_key}

url = "https://api.semanticscholar.org/graph/v1/6432563/citations"
print(url)

response = requests.get(url, headers=headers).json()
print(json.dumps(response, indent=2))

"""
for id in ids: 
    url = "https://api.semanticscholar.org/graph/v1/"+str(id)+"/citations"
    print(url)

    response = requests.get(url, headers=headers).json()
    print(json.dumps(response, indent=2))
# Save the results to json file

with open(output_file, "w", newline='', encoding="utf-8") as outfile  :   
    csv_writer_1 = csv.DictWriter(outfile, fieldnames=["sid","authorId", "url", "name", "paperCount", "hIndex"])

    # Write the headers to the CSV files
    csv_writer_1.writeheader()
    for line in response:
        count+=1
        try:  
            row_papers = {
                        "sid": count, # Add a new column with a surrogated ID, just in case
                        "authorId": line.get("authorId"),
                        "url": line.get("url"),
                        "name": line.get("name"),
                        "paperCount": line.get("paperCount"),
                        "hIndex": line.get("hIndex")
                        }
            # Write the row to CSV 1
            csv_writer_1.writerow(row_papers)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
print(f"Modified JSONL saved to {output_file}")
"""

https://api.semanticscholar.org/graph/v1/6432563/citations
{
  "error": "Not found"
}


'\nfor id in ids: \n    url = "https://api.semanticscholar.org/graph/v1/"+str(id)+"/citations"\n    print(url)\n\n    response = requests.get(url, headers=headers).json()\n    print(json.dumps(response, indent=2))\n# Save the results to json file\n\nwith open(output_file, "w", newline=\'\', encoding="utf-8") as outfile  :   \n    csv_writer_1 = csv.DictWriter(outfile, fieldnames=["sid","authorId", "url", "name", "paperCount", "hIndex"])\n\n    # Write the headers to the CSV files\n    csv_writer_1.writeheader()\n    for line in response:\n        count+=1\n        try:  \n            row_papers = {\n                        "sid": count, # Add a new column with a surrogated ID, just in case\n                        "authorId": line.get("authorId"),\n                        "url": line.get("url"),\n                        "name": line.get("name"),\n                        "paperCount": line.get("paperCount"),\n                        "hIndex": line.get("hIndex")\n                        

In [None]:
url = "https://api.semanticscholar.org/graph/v1/author/batch"
output_file="csv/authors.csv"
papers_file = "csv/papers.csv" 
count=0  
query_params = {
    "fields": "name,url,paperCount,hIndex"#,papers"
}

df=pd.read_csv(papers_file)
ids=df["authorId"].values.tolist()

data = {
    "ids": ids
}
api_key = os.environ.get('API_KEY')
headers = {"x-api-key": api_key}

# Send the API request
response = requests.post(url, params=query_params, json=data, headers=headers).json()
# Save the results to json file
with open(output_file, "w", newline='', encoding="utf-8") as outfile  :   
    csv_writer_1 = csv.DictWriter(outfile, fieldnames=["sid","authorId", "url", "name", "paperCount", "hIndex"])

    # Write the headers to the CSV files
    csv_writer_1.writeheader()
    for line in response:
        count+=1
        try:  
            row_papers = {
                        "sid": count, # Add a new column with a surrogated ID, just in case
                        "authorId": line.get("authorId"),
                        "url": line.get("url"),
                        "name": line.get("name"),
                        "paperCount": line.get("paperCount"),
                        "hIndex": line.get("hIndex")
                        }
            # Write the row to CSV 1
            csv_writer_1.writerow(row_papers)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
print(f"Modified JSONL saved to {output_file}")

JSONDecodeError: Expecting value: line 1 column 1 (char 0)