In [29]:
import requests, json, os, csv, re, urllib.parse
import pandas as pd
from dotenv import load_dotenv
from time import sleep
from pathlib import Path
from contextlib import ExitStack

load_dotenv()

True

In [3]:
api_key = os.environ.get('API_KEY')
headers = {"x-api-key": api_key}

In [60]:
def is_valid_paper(paper):
    return paper['paperId'] is not None and paper['authors'] and paper['abstract'] is not None and paper['title'] is not None and paper['year'] is not None


def is_valid_conference(paper):
    return paper["venue"] is not None


def is_valid_journal(paper):
    return paper["journal"] is not None and "name" in paper["journal"] and "pages" in paper["journal"] and "volume" in paper["journal"]


def get_referencing_author_id(authors):
    return authors[0]['authorId']

# CSV Needed
- paper (w abstract and relevant author) 
- paper-paper (n-n)
- author 
- paper-author (n-n)
- paper-reviewers (n-n)
- keywords
- paper-keywords (n-n)

In [5]:
csv_files = {
    "paper": ["paperId","corpusId", "title", "referenceAuthorId", "abstract", "url", "year", "publicationType", "publicationDate"],
    "paper_paper": ["citingPaperId", "citedPaperId"],
    "author": ["authorId", "authorName"],
    "paper_author": ["paperId", "authorId"],
    "paper_reviewer": ["paperId", "reviewAuthorId"],
    "keywords": ["keyword"],
    "paper_keywords": ["paperId", "keyword"]
}

In [62]:
#********************************************************************************************************************
RECORDS = 100  # Number of records to save per category 
QUERY = "data"  # Query to filter the papers
FIELDS = "paperId,corpusId,title,abstract,authors,url,year,s2FieldsOfStudy,publicationDate,journal,venue,publicationVenue,references.paperId"  # Fields to retrieve from the API
#********************************************************************************************************************

query_encoded = urllib.parse.quote(QUERY)
fields_encoded = urllib.parse.quote(FIELDS)
type_encoded = urllib.parse.quote("Conference,JournalArticle")

starting_papers_url="https://api.semanticscholar.org/graph/v1/paper/search?query="+query_encoded+"&publicationTypes="+type_encoded+"&fields=paperId&limit="+str(RECORDS)
response = requests.get(starting_papers_url, headers=headers).json()
starting_papers = response["data"]


In [63]:
def process_new_papers(processed_papers, to_be_processed_papers, processing_papers, new_papers):
    new_papers.discard(None)
    for paper in new_papers:
        if paper not in processed_papers and paper not in to_be_processed_papers and paper not in processing_papers:
            to_be_processed_papers.add(paper)


def choose_n_papers_to_process(to_be_processed_papers, n):
    return {to_be_processed_papers.pop() for _ in range(min(n, len(to_be_processed_papers)))}

In [None]:
BATCH_SIZE = 200
MAX_RECURSION = 10
csv_folder = Path('csv')

processed_papers = set()
to_be_processed_papers = set()
starting_papers_ids = set([paper['paperId'] for paper in starting_papers])
process_new_papers(processed_papers, to_be_processed_papers, set(), starting_papers_ids)

set_authors = set()
set_keywords = set()
set_papers = set()

with ExitStack() as stack:  # Ensures all files are closed properly
    files = {name: stack.enter_context(open(csv_folder / (name + '.csv'), "w", newline='', encoding="utf-8")) for name in csv_files}
    writers = {name: csv.DictWriter(files[name], fieldnames=fieldnames, delimiter="|") for name, fieldnames in csv_files.items()}
    recursion_block = 0

    for writer in writers.values():
        writer.writeheader()

    while to_be_processed_papers:
        recursion_block+=1
        if recursion_block > MAX_RECURSION:
            break

        processing_papers_id = choose_n_papers_to_process(to_be_processed_papers, BATCH_SIZE)

        processing_papers_data = requests.post(
            'https://api.semanticscholar.org/graph/v1/paper/batch',
            params={'fields': FIELDS},
            json={"ids": list(processing_papers_id)},
            headers=headers
        ).json()
        
        for paper in processing_papers_data:
            try:  
                processed_papers.add(paper['paperId'])
                if not is_valid_paper(paper):
                    continue
                if is_valid_conference(paper):
                    paper["publicationType"]="Conference"
                    paper["journalName"]=None
                    paper["journalVolume"]=None
                    paper["journalPages"]=None
                elif is_valid_journal(paper):
                    paper["publicationType"]="JournalArticle"
                    paper["venue"]=None
                    paper["journalName"]=paper["journal"]["name"]
                    paper["journalVolume"]=paper["journal"]["volume"]
                    if paper["journal"]["pages"] is not None:
                        paper["journalPages"]=re.sub(r'\s+', '', paper["journal"]["pages"])
                    else:
                        paper["journalPages"]=None
                else:
                    continue    
                paperId = paper.get("paperId")
                paper_authors = paper["authors"]

                writers['paper'].writerow({
                    "paperId": paperId,
                    "corpusId": paper.get("corpusId"),
                    "title":  paper.get("title").strip().replace("\n", " ").replace("|", " ").replace('"', "").replace("^", " "),
                    "referenceAuthorId": get_referencing_author_id(paper_authors),
                    "abstract": paper.get("abstract").strip().replace("\n", " ").replace("|", " ").replace('"', "").replace("^", " "),
                    "url": paper.get("url"),
                    "year": paper.get("year"),
                    "publicationType": paper.get("publicationType"),
                    "publicationDate": paper.get("publicationDate"),
                })

                new_papers = set([paper['paperId'] for paper in paper['references']])
                process_new_papers(processed_papers, to_be_processed_papers, processing_papers_id, new_papers)

                for new_paper in new_papers:
                    writers['paper_paper'].writerow({
                        "citingPaperId": paperId,
                        "citedPaperId": new_paper,
                    })

                for author in paper_authors:
                    authorId = author.get("authorId")
                    writers['paper_author'].writerow({
                        "paperId": paperId,
                        "authorId": authorId
                    })

                    if authorId not in set_authors:
                        writers['author'].writerow({
                            "authorId": authorId,
                            "authorName": author.get("name")
                        })

                        set_authors.add(authorId)
            
                paper_keywords = paper.get("s2FieldsOfStudy", [])
                paper_keywords = set(map(lambda x: x['category'], paper_keywords))

                for keyword in paper_keywords:
                    writers["paper_keywords"].writerow({
                            "paperId": paperId,
                            "keyword": keyword
                        })
                    
                    if keyword not in set_keywords:
                        writers["keywords"].writerow({
                            "keyword": keyword
                        })

                        set_keywords.add(keyword)
                
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")


In [72]:
output_file="csv/citations.csv"
papers_file = "csv/paper.csv"  

df=pd.read_csv(papers_file, delimiter='|')
dic_cite=[]

df

Unnamed: 0,paperId,corpusId,title,referenceAuthorId,abstract,url,year,publicationType,publicationDate
0,948fd800ecdd3c99488dde36b41480ca1b8acce3,53214999,The PRIDE database and related tools and resou...,1.390052e+09,Abstract The PRoteomics IDEntifications (PRIDE...,https://www.semanticscholar.org/paper/948fd800...,2018,Conference,2018-11-05
1,59c9f2036e673d8bc9713eed851d12c6c9fe53cb,9267632,A universal algorithm for sequential data comp...,1.457204e+08,A universal algorithm for sequential data comp...,https://www.semanticscholar.org/paper/59c9f203...,1977,Conference,1977-05-01
2,422876e542daadefe3371091a65c5671185796e2,5131034,Attribute-based encryption for fine-grained ac...,1.707396e+06,As more sensitive data is shared and stored by...,https://www.semanticscholar.org/paper/422876e5...,2006,Conference,2006-10-30
3,29858b40a15704398aecdca6bd2820f2fcc99891,219636053,Training Generative Adversarial Networks with ...,2.976930e+06,Training generative adversarial networks (GAN)...,https://www.semanticscholar.org/paper/29858b40...,2020,Conference,2020-06-01
4,f117c6f12d067bd66dad40996b3931c069daa2da,18293514,Business Intelligence and Analytics: From Big ...,4.766666e+07,Business intelligence and analytics (BI&A) has...,https://www.semanticscholar.org/paper/f117c6f1...,2012,Conference,2012-12-01
...,...,...,...,...,...,...,...,...,...
1259,cfd538dd2998ede6599b45b27a4f7b72b4c99adf,207197892,Self-indexing inverted files for fast text ret...,1.444485e+08,Query-processing costs on large text databases...,https://www.semanticscholar.org/paper/cfd538dd...,1996,Conference,1996-10-01
1260,5f14a9595b0796ce6e5338f157b763326c1f632f,11757052,Tom-vs-Pete Classifiers and Identity-Preservin...,2.053403e+09,We propose a method of face verification that ...,https://www.semanticscholar.org/paper/5f14a959...,2012,Conference,
1261,7f4a6d9806b1329706756634d082cb56348b44b7,16145112,Mosaics of scenes with moving objects,2.111092e+09,Image mosaics are useful for a variety of task...,https://www.semanticscholar.org/paper/7f4a6d98...,1998,Conference,1998-06-23
1262,ed2ffda943ddde6d06d3298bdcbe62633d4c20de,24862145,Recombinant immunotoxins specific for a mutant...,2.243822e+09,EGFRvIII is a mutant epidermal growth factor r...,https://www.semanticscholar.org/paper/ed2ffda9...,1996,Conference,1996-12-10


In [27]:
df['paperId'].to_list()

['2287a3930a7568a956aae5f3f037efe8fed675e7',
 'ab9b7934d62c79c16e8792be580e22dc7aebc967',
 '948fd800ecdd3c99488dde36b41480ca1b8acce3',
 '114d9e30d388fa5b74797b864d092a0ee63e5b27',
 '44787913722b856c22c9c81ded1c735c24ad4de4',
 'cc90910b6e31fe44cddc1e341f21eec0aaa5db44',
 'a992c8fd24587f12d41f48df84b29d847634c0e4',
 '7e7343a5608fff1c68c5259db0c77b9193f1546d',
 '627be67feb084f1266cfc36e5aed3c3e7e6ce5f0',
 '8bb6cc4057c0f76e963f19918a79697acbd2bc41',
 'fa02f9123abacd5ba13d41e937d99c077da8d3f6',
 '0e6ef2809bad0965b3df599b05be1e6d859d5543',
 '9f1b79d77201016a2579c013c5d490a12ab596b7',
 '6d8c9fcce8177d6f8d122d653c7d32d7624d6714',
 '67556c4f0cfdd1f09fff373768b03638f949be0d',
 'e7c8aa2cb2223f17615c1b1ae3b33095466e95cc',
 '05e25b9797de9a544ca50e743aa2a15eb129ea72',
 '4e2f43dab69d690dc86422949e410ebf37f522d4',
 '6e00f7980c4efc55ba76efdccebc6411f054a7da',
 'dcd99d49af33bd14e9e0750bcf854e7b306c808a',
 '82d02d8c697119a879756b5393c4aa5defeaa030']

In [None]:
url = "https://api.semanticscholar.org/graph/v1/author/batch"
output_file="csv/authors.csv"
papers_file = "csv/papers.csv" 
count=0  
query_params = {
    "fields": "name,url,paperCount,hIndex"#,papers"
}

df=pd.read_csv(papers_file)
ids=df["authorId"].values.tolist()

data = {
    "ids": ids
}
api_key = os.environ.get('API_KEY')
headers = {"x-api-key": api_key}

# Send the API request
response = requests.post(url, params=query_params, json=data, headers=headers).json()
# Save the results to json file
with open(output_file, "w", newline='', encoding="utf-8") as outfile  :   
    csv_writer_1 = csv.DictWriter(outfile, fieldnames=["sid","authorId", "url", "name", "paperCount", "hIndex"])

    # Write the headers to the CSV files
    csv_writer_1.writeheader()
    for paper in response:
        count+=1
        try:  
            paper_row = {
                        "sid": count, # Add a new column with a surrogated ID, just in case
                        "authorId": paper.get("authorId"),
                        "url": paper.get("url"),
                        "name": paper.get("name"),
                        "paperCount": paper.get("paperCount"),
                        "hIndex": paper.get("hIndex")
                        }
            # Write the row to CSV 1
            csv_writer_1.writerow(paper_row)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
print(f"Modified JSONL saved to {output_file}")

Modified JSONL saved to csv/authors.csv
