In [2]:
import requests, json, os, csv, re, urllib.parse
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
from contextlib import ExitStack

load_dotenv()

True

In [4]:
# Semantic Scholar credentials: the key is read from the process environment
# (None when API_KEY is not set) and sent with each request as "x-api-key".
api_key = os.getenv("API_KEY")
headers = {"x-api-key": api_key}

In [None]:
def is_valid_paper(paper):
    """Return True when a paper record carries everything the export needs.

    A record is usable when it has at least one author, an abstract and a
    fields-of-study entry.

    Args:
        paper: dict describing one paper; the keys read are ``authors``,
            ``abstract`` and ``s2FieldsOfStudy``.

    Returns:
        bool: True if the record is usable, False otherwise.
    """
    # `x is not []` compares identity against a brand-new list and is always
    # True, so emptiness has to be tested through truthiness instead.
    has_authors = bool(paper.get("authors"))
    has_abstract = paper.get("abstract") is not None
    # The key is "s2FieldsOfStudy" (plural "Fields"), matching the FIELDS
    # string requested from the API.
    has_fields_of_study = paper.get("s2FieldsOfStudy") is not None
    return has_authors and has_abstract and has_fields_of_study


def is_valid_conference(paper):
    """Tell whether a paper record has a venue value.

    Args:
        paper: dict describing one paper; must contain the ``venue`` key.

    Returns:
        bool: True when ``paper["venue"]`` is anything other than None.
    """
    venue = paper["venue"]
    return venue is not None


def is_valid_journal(paper):
    """Tell whether a paper record has a journal entry with the expected keys.

    Args:
        paper: dict describing one paper; must contain the ``journal`` key.

    Returns:
        bool: True when ``paper["journal"]`` is not None and contains the
        keys ``name``, ``pages`` and ``volume`` (their values are not checked).
    """
    journal = paper["journal"]
    if journal is None:
        return False
    return all(key in journal for key in ("name", "pages", "volume"))

# CSV files needed
- paper (with abstract and relevant author)
- paper-paper (n-n)
- author 
- paper-author (n-n)
- paper-reviewers (n-n)
- keywords
- paper-keywords (n-n)

In [18]:
# Column layout of every CSV file to generate, keyed by file name (without ".csv").
csv_files = {
    "paper": ["sid","paperId","corpusId", "title", "referenceAuthorId", "url", "year", "referenceCount", "citationCount", "influentialCitationCount","publicationType", "publicationDate"],
    # The two endpoints of a paper-to-paper relation need distinct column names:
    # with a repeated name a csv.DictWriter row (a dict) can only carry one value.
    "paper_paper": ["sidCitingPaper", "sidCitedPaper"],
    "author": ["sid", "authorId", "name"],
    "paper_author": ["sidPaper", "sidAuthor"],
    "paper_reviewer": ["sidPaper", "sidReviewAuthor"],
    "keywords": ["sid", "keyword"],
    "paper_keywords": ["sidPaper", "sidKeyword"]
}

In [19]:
# Create one CSV file per entry of `csv_files` and write its header row.
csv_folder = Path('csv')
# open(..., "w") does not create missing parent folders, so make sure the
# output directory exists before any file is opened.
csv_folder.mkdir(parents=True, exist_ok=True)

with ExitStack() as stack:  # Ensures all files are closed properly
    files = {name: stack.enter_context(open(csv_folder / (name + '.csv'), "w", newline='', encoding="utf-8")) for name in csv_files}
    writers = {name: csv.DictWriter(files[name], fieldnames=fieldnames) for name, fieldnames in csv_files.items()}

    # Write headers for each CSV file
    for writer in writers.values():
        writer.writeheader()

In [None]:
# Single exploratory search request (JournalArticle only) against the paper search endpoint.
#********************************************************************************************************************
RECORDS = 50  # Number of records to save per category 
QUERY = "data"  # Query to filter the papers
FIELDS = "paperId,corpusId,title,abstract,authors,url,year,referenceCount,citationCount,influentialCitationCount,s2FieldsOfStudy,publicationDate,journal,venue,publicationVenue"  # Fields to retrieve from the API
#********************************************************************************************************************

query_encoded = urllib.parse.quote(QUERY)
fields_encoded = urllib.parse.quote(FIELDS)

type_encoded = urllib.parse.quote("JournalArticle")
url1="https://api.semanticscholar.org/graph/v1/paper/search?query="+query_encoded+"&publicationTypes="+type_encoded+"&fields="+fields_encoded+"&limit="+str(RECORDS)

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports HTTP errors here instead of letting an error
# payload surface later as a KeyError on response['data'].
http_response = requests.get(url1, headers=headers, timeout=30)
http_response.raise_for_status()
response = http_response.json()

In [15]:
# Keep the list of search hits and display the authors of the first one
# to inspect the shape of an author entry.
data = response["data"]
data[0]["authors"]

[{'authorId': '2113243762', 'name': 'Hugo Touvron'},
 {'authorId': '51021910', 'name': 'M. Cord'},
 {'authorId': '3271933', 'name': 'Matthijs Douze'},
 {'authorId': '1403239967', 'name': 'Francisco Massa'},
 {'authorId': '3469062', 'name': 'Alexandre Sablayrolles'},
 {'authorId': '2065248680', 'name': "Herv'e J'egou"}]

In [None]:
# Export papers and their keywords to CSV.
# For every publication type, one search request is sent; each valid hit becomes
# one row in `output_file` and one row per distinct field-of-study category in
# `keywords_file`. Both files share the surrogate id `sid`.
output_file = "csv/papers.csv"
keywords_file = "csv/keywords.csv"

api_key = os.environ.get('API_KEY')
headers = {"x-api-key": api_key}

#********************************************************************************************************************
RECORDS = 50  # Number of records to save per category 
PUBLICATION_TYPES = ["Conference", "JournalArticle"]  # Publication types to filter (There is not Workshop)
QUERY = "data"  # Query to filter the papers
FIELDS = "paperId,corpusId,title,abstract,authors,url,year,referenceCount,citationCount,influentialCitationCount,s2FieldsOfStudy,publicationDate,journal,venue,publicationVenue"  # Fields to retrieve from the API
#********************************************************************************************************************

query_encoded = urllib.parse.quote(QUERY)
fields_encoded = urllib.parse.quote(FIELDS)

count = 0

# open(..., "w") does not create missing folders.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
Path(keywords_file).parent.mkdir(parents=True, exist_ok=True)

with open(output_file, "w", newline='', encoding="utf-8") as outfile, open(keywords_file, "w", newline='', encoding="utf-8") as keyfile:
    csv_writer_1 = csv.DictWriter(outfile, fieldnames=["sid","paperId","corpusId", "title", "authorId", "authorName", "url", "year", "referenceCount", "citationCount", "influentialCitationCount","publicationType", "publicationDate"])
    csv_writer_2 = csv.DictWriter(keyfile, fieldnames=["sid","keyword"])
    # Write the headers to the CSV files
    csv_writer_1.writeheader()
    csv_writer_2.writeheader()
    for publication_type in PUBLICATION_TYPES:
        type_encoded = urllib.parse.quote(publication_type)
        url1="https://api.semanticscholar.org/graph/v1/paper/search?query="+query_encoded+"&publicationTypes="+type_encoded+"&fields="+fields_encoded+"&limit="+str(RECORDS)
        # Fail fast on HTTP errors and never wait forever on a stalled connection.
        http_response = requests.get(url1, headers=headers, timeout=30)
        http_response.raise_for_status()
        response = http_response.json()
        # A search without hits may come back without a "data" list.
        for line in response.get("data") or []:
            count += 1
            try:
                # The record must be passed to the validator explicitly.
                if not is_valid_paper(line):
                    continue
                if publication_type=="Conference":
                    if not is_valid_conference(line):
                        continue
                    line["publicationType"]="Conference"
                    line["journalName"]=None
                    line["journalVolume"]=None
                    line["journalPages"]=None
                elif publication_type=="JournalArticle":
                    if not is_valid_journal(line):
                        continue
                    line["publicationType"]="JournalArticle"
                    line["venue"]=None
                    line["journalName"]=line["journal"]["name"]
                    line["journalVolume"]=line["journal"]["volume"]
                    if line["journal"]["pages"] is not None:
                        # Page ranges are stored without any whitespace.
                        line["journalPages"]=re.sub(r'\s+', '', line["journal"]["pages"])
                    else:
                        line["journalPages"]=None
                # Only the first listed author is kept on the paper row.
                line["authorId"] = line["authors"][0]["authorId"]
                line["authorName"] = line["authors"][0]["name"]
                row_papers = {
                    "sid": count, # Add a new column with a surrogated ID, just in case
                    "paperId": line.get("paperId"),
                    "corpusId": line.get("corpusId"),
                    "title":  line.get("title").strip().replace("\n", " "),
                    "authorId": line.get("authorId"),
                    "authorName": line.get("authorName"),
                    "url": line.get("url"),
                    "year": line.get("year"),
                    "referenceCount": line.get("referenceCount"),
                    "citationCount": line.get("citationCount"),
                    "influentialCitationCount": line.get("influentialCitationCount"),
                    "publicationType": line.get("publicationType"),
                    "publicationDate": line.get("publicationDate"),
                }
                # Distinct categories, sorted so that re-runs produce the same file.
                # `or []` also covers a key that is present with a null value.
                keywords = sorted({keyword["category"] for keyword in line.get("s2FieldsOfStudy") or []})
            except (KeyError, IndexError, TypeError, AttributeError) as e:
                # These are the errors a malformed record can actually raise here;
                # skip the record instead of aborting the whole export.
                print(f"Skipping malformed record (sid {count}): {e!r}")
                continue

            # Rows are written only once the whole record has been processed, so a
            # bad record never leaves a paper row without its keyword rows.
            csv_writer_1.writerow(row_papers)
            for keyword in keywords:
                # We will use the surrogated ID here, just in case
                csv_writer_2.writerow({"sid": count, "keyword": keyword})
print(f"Modified JSONL saved to {output_file}")


KeyboardInterrupt: 

In [133]:
# Prototype: fetch the reference lists of a few exported papers through the
# paper batch endpoint and print the raw JSON answer.
output_file="csv/citations.csv"
papers_file = "csv/papers.csv"  

df=pd.read_csv(papers_file)
dic_cite=[]

api_key = os.environ.get('API_KEY')
headers = {"x-api-key": api_key}

# Only the first three papers are queried while the extraction is prototyped.
ids=df["paperId"][0:3].tolist()


# The headers built above are now actually sent (the request used to go out
# without the API key); HTTP errors are raised and a stalled connection times out.
r = requests.post(
    'https://api.semanticscholar.org/graph/v1/paper/batch',
    params={'fields': 'references'},
    json={"ids": ids},
    headers=headers,
    timeout=30,
)
r.raise_for_status()

print(json.dumps(r.json(), indent=2))

# Disabled draft, kept as a string literal so it is never executed.
# NOTE(review): unlike the first loop, the URL built in the second loop has no
# "paper/" path segment, and the final writer uses author columns for a
# citations file -- confirm both before re-enabling this code.
"""
for paper_id, sid in zip(df["paperId"], df["sid"]): 
    url = "https://api.semanticscholar.org/graph/v1/paper/"+str(paper_id)+"/references?limit=10"
    response = requests.get(url, headers=headers).json()
    for line in response["data"]:
        dic_cite.append({"sid":sid,"citingPaperId":line["citedPaper"]["paperId"]})
        
print(dic_cite)


for id in ids: 
    url = "https://api.semanticscholar.org/graph/v1/"+str(id)+"/citations"
    print(url)

    response = requests.get(url, headers=headers).json()
    print(json.dumps(response, indent=2))
# Save the results to json file

with open(output_file, "w", newline='', encoding="utf-8") as outfile  :   
    csv_writer_1 = csv.DictWriter(outfile, fieldnames=["sid","authorId", "url", "name", "paperCount", "hIndex"])

    # Write the headers to the CSV files
    csv_writer_1.writeheader()
    for line in response:
        count+=1
        try:  
            row_papers = {
                        "sid": count, # Add a new column with a surrogated ID, just in case
                        "authorId": line.get("authorId"),
                        "url": line.get("url"),
                        "name": line.get("name"),
                        "paperCount": line.get("paperCount"),
                        "hIndex": line.get("hIndex")
                        }
            # Write the row to CSV 1
            csv_writer_1.writerow(row_papers)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
print(f"Modified JSONL saved to {output_file}")
"""

[
  {
    "paperId": "ad7ddcc14984caae308c397f1a589aae75d4ab71",
    "references": [
      {
        "paperId": "cec7872b194aadf54140578b9be52939eb1112e9",
        "title": "LambdaNetworks: Modeling Long-Range Interactions Without Attention"
      },
      {
        "paperId": "b52431a4268bd2f848db4a0c8c614dc1e687eeab",
        "title": "Grafit: Learning fine-grained image representations with coarse labels"
      },
      {
        "paperId": "268d347e8a55b5eb82fb5e7d2f800e33c75ab18a",
        "title": "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale"
      },
      {
        "paperId": "64b9be00f4eecd465b4e8e46e2ab7624d7eaeb2b",
        "title": "Global Self-Attention Networks for Image Recognition"
      },
      {
        "paperId": "867ec3a4837213d0096fec75aa6d1dbbfd2c4b1d",
        "title": "Feature Space Augmentation for Long-Tailed Data"
      },
      {
        "paperId": "bc022dbb37b1bbf3905a7404d19c03ccbf6b81a8",
        "title": "Generative Pretra

'\nfor paper_id, sid in zip(df["paperId"], df["sid"]): \n    url = "https://api.semanticscholar.org/graph/v1/paper/"+str(paper_id)+"/references?limit=10"\n    response = requests.get(url, headers=headers).json()\n    for line in response["data"]:\n        dic_cite.append({"sid":sid,"citingPaperId":line["citedPaper"]["paperId"]})\n        \nprint(dic_cite)\n\n\nfor id in ids: \n    url = "https://api.semanticscholar.org/graph/v1/"+str(id)+"/citations"\n    print(url)\n\n    response = requests.get(url, headers=headers).json()\n    print(json.dumps(response, indent=2))\n# Save the results to json file\n\nwith open(output_file, "w", newline=\'\', encoding="utf-8") as outfile  :   \n    csv_writer_1 = csv.DictWriter(outfile, fieldnames=["sid","authorId", "url", "name", "paperCount", "hIndex"])\n\n    # Write the headers to the CSV files\n    csv_writer_1.writeheader()\n    for line in response:\n        count+=1\n        try:  \n            row_papers = {\n                        "sid": co

In [None]:
# Export author details to CSV.
# The author ids stored in the papers CSV are sent to the author batch endpoint
# and one row is written per author record that comes back.
url = "https://api.semanticscholar.org/graph/v1/author/batch"
output_file="csv/authors.csv"
papers_file = "csv/papers.csv" 
count=0  
query_params = {
    "fields": "name,url,paperCount,hIndex"#,papers"
}

# Read the ids as text: letting pandas infer a numeric dtype would turn them
# into ints (or floats such as 123.0 as soon as one value is missing).
df=pd.read_csv(papers_file, dtype={"authorId": str})
# Drop missing ids and duplicates (one author can be first author of several
# papers) while keeping the order of first appearance.
ids=df["authorId"].dropna().unique().tolist()

data = {
    "ids": ids
}
api_key = os.environ.get('API_KEY')
headers = {"x-api-key": api_key}

# Send the API request; fail fast on HTTP errors instead of iterating over an
# error payload, and never wait forever on a stalled connection.
http_response = requests.post(url, params=query_params, json=data, headers=headers, timeout=30)
http_response.raise_for_status()
response = http_response.json()

# Save the results to the CSV file
with open(output_file, "w", newline='', encoding="utf-8") as outfile:
    csv_writer_1 = csv.DictWriter(outfile, fieldnames=["sid","authorId", "url", "name", "paperCount", "hIndex"])

    # Write the headers to the CSV files
    csv_writer_1.writeheader()
    for line in response:
        # Skip anything that is not an author record; presumably the endpoint
        # returns null for ids it cannot resolve -- TODO confirm in the API docs.
        if not isinstance(line, dict):
            continue
        count+=1
        row_papers = {
            "sid": count, # Add a new column with a surrogated ID, just in case
            "authorId": line.get("authorId"),
            "url": line.get("url"),
            "name": line.get("name"),
            "paperCount": line.get("paperCount"),
            "hIndex": line.get("hIndex")
        }
        # Write the row to CSV 1
        csv_writer_1.writerow(row_papers)
print(f"Modified JSONL saved to {output_file}")

Modified JSONL saved to csv/authors.csv
