In [25]:
import requests, json, os, csv, re, urllib.parse
import pandas as pd
from dotenv import load_dotenv
from time import sleep
from pathlib import Path
from contextlib import ExitStack
from datetime import datetime

load_dotenv()

True

In [2]:
# Credentials for the Semantic Scholar API: the key is read from the API_KEY
# environment variable and sent on every request through the x-api-key header.
api_key = os.getenv('API_KEY')
headers = dict([("x-api-key", api_key)])

In [3]:
def is_id_valid(paper):
    """Tell whether an API record exists and carries a paper id.

    Returns False when ``paper`` is None or when its ``'paperId'`` entry is
    None, and True otherwise.
    """
    if paper is None:
        return False
    return paper['paperId'] is not None

def is_valid_paper(paper):
    """Check that a paper record has every field the CSV export relies on.

    A paper is valid when it has at least one author and its abstract, title,
    year and publicationTypes are all present and not None.

    Args:
        paper: dict describing one paper record.

    Returns:
        bool: True when all required fields are usable, False otherwise.
        A missing key is treated the same as a None value, and the result is
        always a real bool (never the falsy ``authors`` value itself).
    """
    # An empty or missing author list makes the paper unusable for the author tables.
    if not paper.get('authors'):
        return False
    required_fields = ('abstract', 'title', 'year', 'publicationTypes')
    return all(paper.get(field) is not None for field in required_fields)


def is_valid_conference(paper):
    """Tell whether a paper can be exported as a conference publication.

    True only when "Conference" is among ``paper["publicationTypes"]`` and both
    ``paper["venue"]`` and ``paper["publicationVenue"]`` are not None.
    """
    if "Conference" not in paper["publicationTypes"]:
        return False
    if paper["venue"] is None:
        return False
    return paper["publicationVenue"] is not None


def is_valid_journal(paper):
    """Tell whether a paper can be exported as a journal article.

    True only when "JournalArticle" is among ``paper["publicationTypes"]``,
    ``paper["publicationVenue"]`` and ``paper["journal"]`` are not None, and the
    journal entry contains the keys "name", "pages" and "volume" (their values
    are not inspected).
    """
    if "JournalArticle" not in paper["publicationTypes"]:
        return False
    if paper["publicationVenue"] is None or paper["journal"] is None:
        return False
    journal = paper["journal"]
    return all(key in journal for key in ("name", "pages", "volume"))


def get_referencing_author_id(authors):
    """Return the ``'authorId'`` of the first entry in ``authors``.

    The first listed author is the one stored as the paper's reference author.
    """
    first_author = authors[0]
    return first_author['authorId']

# CSV Needed
- paper (with abstract and relevant author)
- paper-paper (n-n)
- author 
- paper-author (n-n)
- paper-reviewers (n-n)
- keywords
- paper-keywords (n-n)
- conference (1-n)
- journal (1-n)
- year (1-n)

In [4]:
# Output schema: one CSV per entry, keyed by file stem, listing its columns in order.
csv_files = dict(
    # Node table: one row per exported paper.
    paper=[
        "paperId", "corpusId", "title", "referenceAuthorId", "abstract", "url",
        "publicationType", "publicationDate", "journalId", "conferenceId", "yearId",
    ],
    # Relationship: citing paper -> cited paper.
    paper_paper=["citingPaperId", "citedPaperId"],
    # Node table: one row per author.
    author=["authorId", "authorName"],
    # Relationship: paper -> author.
    paper_author=["paperId", "authorId"],
    # Relationship: paper -> reviewing author.
    paper_reviewer=["paperId", "reviewAuthorId"],
    # Node table: one row per keyword.
    keywords=["keyword"],
    # Relationship: paper -> keyword.
    paper_keywords=["paperId", "keyword"],
    # Node table: conference editions.
    conference=["conferenceId", "conferenceName", "yearId"],
    # Node table: journal issues.
    journal=["journalId", "journalName", "journalPages", "journalVolume", "yearId"],
)

In [6]:
# Synthetic "year" table: built locally because it is just the list of calendar
# years from FIRST_YEAR up to the current year, each with a zero-based id.
FIRST_YEAR = 1950  # year that receives id 0; the id of any year is year - FIRST_YEAR
years = list(range(FIRST_YEAR, datetime.now().year + 1))
ids = [year - FIRST_YEAR for year in years]

df = pd.DataFrame({"ids": ids, "Year": years})

# Create the output folder first: to_csv raises if the target directory is missing,
# which is what happens on a fresh checkout where csv/ has not been created yet.
years_path = Path("csv") / "years.csv"
years_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(years_path, index=False)

print("CSV file 'years.csv' created successfully!")

CSV file 'years.csv' created successfully!


In [7]:
#********************************************************************************************************************
RECORDS = 100  # Number of seed papers requested from the search endpoint
QUERY = "semantic data modelling and property graphs"  # Query to filter the papers
FIELDS = "paperId,corpusId,title,abstract,authors,url,year,s2FieldsOfStudy,publicationDate,publicationTypes,journal,venue,publicationVenue,references.paperId"  # Fields to retrieve from the API
#********************************************************************************************************************

# URL-encode the pieces that are concatenated into the query string below.
query_encoded = urllib.parse.quote(QUERY)
fields_encoded = urllib.parse.quote(FIELDS)  # not used by the seed search, which only asks for paperId
type_encoded = urllib.parse.quote("Conference,JournalArticle")

# Seed search: only paper ids are requested here; full records are fetched later in batches.
starting_papers_url="https://api.semanticscholar.org/graph/v1/paper/search?query="+query_encoded+"&publicationTypes="+type_encoded+"&fields=paperId&limit="+str(RECORDS)

# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() turns an HTTP error into an explicit exception here instead
# of an opaque KeyError on response["data"].
search_response = requests.get(starting_papers_url, headers=headers, timeout=60)
search_response.raise_for_status()
response = search_response.json()

# A successful reply without a "data" list is treated as "no matches".
starting_papers = response.get("data") or []
if not starting_papers:
    print("No seed papers returned for query:", QUERY)


In [8]:
def process_new_papers(processed_papers, to_be_processed_papers, processing_papers, new_papers):
    """Queue the paper ids from ``new_papers`` that the crawl has not seen yet.

    ``None`` is removed from ``new_papers`` in place, so the caller's set is
    modified. Every remaining id that is not in ``processed_papers``,
    ``processing_papers`` or already in ``to_be_processed_papers`` is added to
    ``to_be_processed_papers``. Nothing is returned.
    """
    new_papers.discard(None)
    unseen = new_papers.difference(processed_papers, processing_papers)
    to_be_processed_papers.update(unseen)


def choose_n_papers_to_process(to_be_processed_papers, n):
    """Remove up to ``n`` ids from ``to_be_processed_papers`` and return them as a set.

    The ids are popped from the given set, so it shrinks by the number of ids
    returned. When fewer than ``n`` ids are available, all of them are taken.
    """
    take = min(n, len(to_be_processed_papers))
    batch = set()
    for _ in range(take):
        batch.add(to_be_processed_papers.pop())
    return batch

In [9]:
# ---------------------------------------------------------------------------
# Crawl: starting from the seed papers, fetch full paper records in batches,
# queue the papers they reference, and stream every entity and relationship
# into csv/<name>.csv (one file per key of csv_files, "|"-delimited).
# ---------------------------------------------------------------------------
BATCH_SIZE = 200       # paper ids sent per paper/batch request
MAX_RECURSION = 10     # maximum number of batches fetched before the crawl stops
FIRST_YEAR = 1950      # yearId is stored as year - FIRST_YEAR
REQUEST_TIMEOUT = 60   # seconds allowed for each batch request
csv_folder = Path('csv')
csv_folder.mkdir(parents=True, exist_ok=True)  # opening the CSVs below fails if the folder is missing

processed_papers = set()
to_be_processed_papers = set()
starting_papers_ids = {seed['paperId'] for seed in starting_papers}
process_new_papers(processed_papers, to_be_processed_papers, set(), starting_papers_ids)

# Registries of what has already been written, so each entity row appears once.
set_authors = set()
set_keywords = set()
set_papers = set()     # not used in this cell; kept because it is a notebook-level name
set_joutnals = set()   # (sic) spelling kept so any other cell using this name keeps working
set_conferences = set()


def _clean_text(text):
    """Strip ``text`` and remove characters that would break the "|"-delimited CSV rows."""
    return text.strip().replace("\n", " ").replace("|", " ").replace('"', "").replace("^", " ")


with ExitStack() as stack:  # Ensures all files are closed properly
    files = {name: stack.enter_context(open(csv_folder / (name + '.csv'), "w", newline='', encoding="utf-8")) for name in csv_files}
    writers = {name: csv.DictWriter(files[name], fieldnames=fieldnames, delimiter="|") for name, fieldnames in csv_files.items()}
    recursion_block = 0

    # Header has to be removed for tables and changed for relationships! Ask Alfio
    for writer in writers.values():
        writer.writeheader()

    while to_be_processed_papers:
        recursion_block += 1
        if recursion_block > MAX_RECURSION:
            break

        processing_papers_id = choose_n_papers_to_process(to_be_processed_papers, BATCH_SIZE)

        batch_response = requests.post(
            'https://api.semanticscholar.org/graph/v1/paper/batch',
            params={'fields': FIELDS},
            json={"ids": list(processing_papers_id)},
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        # Fail loudly on an HTTP error instead of iterating over an error payload.
        batch_response.raise_for_status()
        processing_papers_data = batch_response.json()
        if not isinstance(processing_papers_data, list):
            raise RuntimeError(f"Unexpected paper/batch payload: {processing_papers_data!r}")

        for paper in processing_papers_data:
            try:
                # Records may be None or lack an id; those are skipped entirely.
                if not is_id_valid(paper):
                    continue
                processed_papers.add(paper['paperId'])
                if not is_valid_paper(paper):
                    continue

                # NOTE(review): a paper published before FIRST_YEAR gets a negative yearId,
                # which has no matching row in csv/years.csv — confirm whether such papers
                # should be skipped.
                year_id = paper["year"] - FIRST_YEAR

                if is_valid_conference(paper):
                    paper["publicationType"] = "Conference"
                    paper["journalName"] = None
                    paper["journalVolume"] = None
                    paper["journalPages"] = None
                    paper["conferenceId"] = paper["publicationVenue"]["id"]
                    paper["journalId"] = None

                    # One conference row per (venue id, year) pair.
                    conference_key = paper["conferenceId"] + str(paper["year"])
                    if conference_key not in set_conferences:
                        writers['conference'].writerow({
                            "conferenceId": paper["conferenceId"],
                            "conferenceName": paper["publicationVenue"]["name"],
                            "yearId": year_id,
                        })
                        # Registered only after the row is written, so a failed write can be retried.
                        set_conferences.add(conference_key)

                elif is_valid_journal(paper):
                    paper["publicationType"] = "JournalArticle"
                    paper["journalId"] = paper["publicationVenue"]["id"]
                    paper["venue"] = None
                    paper["conferenceId"] = None
                    paper["journalName"] = paper["journal"]["name"]
                    paper["journalVolume"] = paper["journal"]["volume"]
                    if paper["journal"]["pages"] is not None:
                        # Page ranges can contain stray whitespace; remove all of it.
                        paper["journalPages"] = re.sub(r'\s+', '', paper["journal"]["pages"])
                    else:
                        paper["journalPages"] = None

                    # One journal row per (venue id, year) pair; volume and pages come from
                    # the first paper seen for that pair.
                    journal_key = paper["journalId"] + str(paper["year"])
                    if journal_key not in set_joutnals:
                        writers['journal'].writerow({
                            "journalId": paper["journalId"],
                            "journalName": paper["journalName"],
                            "journalVolume": paper["journalVolume"],
                            "journalPages": paper["journalPages"],
                            "yearId": year_id,
                        })
                        set_joutnals.add(journal_key)

                else:
                    # Neither a usable conference paper nor a usable journal article.
                    continue

                paperId = paper.get("paperId")
                paper_authors = paper["authors"]
                writers['paper'].writerow({
                    "paperId": paperId,
                    "corpusId": paper.get("corpusId"),
                    "title": _clean_text(paper.get("title")),
                    "referenceAuthorId": get_referencing_author_id(paper_authors),
                    "abstract": _clean_text(paper.get("abstract")),
                    "url": paper.get("url"),
                    "yearId": year_id,
                    "publicationType": paper.get("publicationType"),
                    "publicationDate": paper.get("publicationDate"),
                    "journalId": paper.get("journalId"),
                    "conferenceId": paper.get("conferenceId")
                })

                # A missing or null reference list is treated as "no references".
                new_papers = {reference['paperId'] for reference in (paper.get('references') or [])}
                # Queues unseen ids and also drops None from new_papers before the rows are written.
                process_new_papers(processed_papers, to_be_processed_papers, processing_papers_id, new_papers)

                for new_paper in new_papers:
                    writers['paper_paper'].writerow({
                        "citingPaperId": paperId,
                        "citedPaperId": new_paper,
                    })

                for author in paper_authors:
                    authorId = author.get("authorId")
                    writers['paper_author'].writerow({
                        "paperId": paperId,
                        "authorId": authorId
                    })

                    if authorId not in set_authors:
                        writers['author'].writerow({
                            "authorId": authorId,
                            "authorName": author.get("name")
                        })
                        set_authors.add(authorId)

                # A missing or null field-of-study list is treated as "no keywords".
                paper_keywords = {field['category'] for field in (paper.get("s2FieldsOfStudy") or [])}

                for keyword in paper_keywords:
                    writers["paper_keywords"].writerow({
                        "paperId": paperId,
                        "keyword": keyword
                    })

                    if keyword not in set_keywords:
                        writers["keywords"].writerow({
                            "keyword": keyword
                        })
                        set_keywords.add(keyword)

            except (KeyError, TypeError, AttributeError) as e:
                # Best effort per record: a malformed paper is reported and skipped so that
                # one bad record does not abort the whole crawl.
                bad_id = paper.get("paperId") if isinstance(paper, dict) else None
                print(f"Skipping malformed paper record {bad_id}: {e!r}")


In [16]:
# Sanity check on venue grouping: if the two counts printed below are equal, every
# paper sits in its own conference/journal row, whereas papers are expected to share events.
papers_file = "csv/paper.csv"
conferences_file = "csv/conference.csv"
journals_file = "csv/journal.csv"

# All three files were written with "|" as the delimiter.
df, df_conferences, df_journals = (
    pd.read_csv(csv_path, sep='|')
    for csv_path in (papers_file, conferences_file, journals_file)
)

event_count = len(df_conferences) + len(df_journals)
print(f"Number of lines in papers: {len(df)}")
print(f"Number of lines in events: {event_count}")

Number of lines in papers: 840
Number of lines in events: 504


In [11]:
# Prune the citation table: keep only rows whose cited paper is present in the
# papers DataFrame (df), then overwrite csv/paper_paper.csv with the result.
citations_file = "csv/paper_paper.csv"

df_citations = pd.read_csv(citations_file, sep='|')
print(f"Number of references: {len(df_citations)}")

known_paper_ids = df['paperId'].to_list()
cited_is_known = df_citations["citedPaperId"].isin(known_paper_ids)
df_citations_new = df_citations[cited_is_known]
print(f"Number of references after cleaning: {len(df_citations_new)}")

df_citations_new.to_csv("csv/paper_paper.csv", sep='|', index=False)

Number of references: 34455
Number of references after cleaning: 2144


In [None]:
# Disabled experiment: both snippets below are wrapped in triple-quoted strings, so
# nothing in this cell executes; the cell only displays the second string as its value.
# Intent (from the prompts and the last lines): ask an OpenAI model where each conference
# was hosted and store the answer in a 'Country' column of csv/conference.csv.
# NOTE(review): the first snippet requests model "gpt-3", which the recorded output rejects
# with a 404 model_not_found error. The two snippets also use different client call styles
# (openai.completions.create vs openai.ChatCompletion.create) — confirm which SDK version
# is installed before re-enabling either one.
"""
import openai

openai.api_key = os.environ.get('GPT_KEY')
# Sample function to fetch the host country from GPT
def get_host_country(conference, year):
    prompt = f"Where was the {conference} conference hosted in {year}?"
    
    try:
        response = openai.completions.create(
            model="gpt-3",
            prompt=prompt,
            max_tokens=50
        )
        return response["choices"][0]["text"].strip()  # Adjusted field name
    except Exception as e:
        print(f"Error fetching data for {conference} in {year}: {e}")
        return None

# Example usage
response = get_host_country("Knowledge Discovery and Data Mining", 2024)
print(response)
"""



"""
openai.api_key = os.environ.get('GPT_KEY')

# Function to query GPT for the host country
def get_host_country(conference, year):
    prompt = f"Where was the {conference} conference hosted in {year}?"
    
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )
        return response["choices"][0]["message"]["content"]
    
    except Exception as e:
        print(f"Error fetching data for {conference} in {year}: {e}")
        return None

# Iterate through the DataFrame and call the GPT API
df_conferences['Country'] = df_conferences.apply(lambda row: get_host_country(row["conferenceName"], row['yearId']+1950), axis=1)

df_conferences.to_csv("csv/conference.csv", sep='|', index=False)
"""

Error fetching data for Knowledge Discovery and Data Mining in 2024: Error code: 404 - {'error': {'message': 'The model `gpt-3` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}
None


'\nopenai.api_key = os.environ.get(\'GPT_KEY\')\n\n# Function to query GPT for the host country\ndef get_host_country(conference, year):\n    prompt = f"Where was the {conference} conference hosted in {year}?"\n    \n    try:\n        response = openai.ChatCompletion.create(\n            model="gpt-4",\n            messages=[{"role": "user", "content": prompt}]\n        )\n        return response["choices"][0]["message"]["content"]\n    \n    except Exception as e:\n        print(f"Error fetching data for {conference} in {year}: {e}")\n        return None\n\n# Iterate through the DataFrame and call the GPT API\ndf_conferences[\'Country\'] = df_conferences.apply(lambda row: get_host_country(row["conferenceName"], row[\'yearId\']+1950), axis=1)\n\ndf_conferences.to_csv("csv/conference.csv", sep=\'|\', index=False)\n'

In [None]:
# Author enrichment: look up name, url, paperCount and hIndex for every author id
# found in papers_file and write one row per author to output_file.
url = "https://api.semanticscholar.org/graph/v1/author/batch"
output_file="csv/authors.csv"
# NOTE(review): the crawl cell writes csv/paper.csv and csv/author.csv ("|"-delimited), and
# csv/paper.csv has no "authorId" column. This cell reads a comma-delimited csv/papers.csv
# with an "authorId" column, so its input presumably comes from elsewhere — confirm the path.
papers_file = "csv/papers.csv"
AUTHOR_BATCH_SIZE = 1000  # presumably the per-request id limit of author/batch — TODO confirm against the API docs
REQUEST_TIMEOUT = 60      # seconds allowed for each batch request
count=0
query_params = {
    "fields": "name,url,paperCount,hIndex"#,papers"
}

df=pd.read_csv(papers_file)

# Missing ids cannot be looked up, and duplicates would only produce duplicate rows.
author_id_column = df["authorId"].dropna()
if pd.api.types.is_float_dtype(author_id_column):
    # A numeric column containing gaps is read as float; restore integer ids.
    author_id_column = author_id_column.astype("int64")
ids = author_id_column.drop_duplicates().values.tolist()

data = {
    "ids": ids
}
api_key = os.environ.get('API_KEY')
headers = {"x-api-key": api_key}

# Send the API request in chunks and collect every returned record in `response`.
response = []
for start in range(0, len(ids), AUTHOR_BATCH_SIZE):
    chunk_payload = {"ids": ids[start:start + AUTHOR_BATCH_SIZE]}
    batch_response = requests.post(
        url,
        params=query_params,
        json=chunk_payload,
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    # Fail loudly on an HTTP error instead of iterating over an error payload.
    batch_response.raise_for_status()
    batch_payload = batch_response.json()
    if not isinstance(batch_payload, list):
        raise RuntimeError(f"Unexpected author/batch payload: {batch_payload!r}")
    response.extend(batch_payload)

# Save the results to a CSV file
with open(output_file, "w", newline='', encoding="utf-8") as outfile:
    csv_writer_1 = csv.DictWriter(outfile, fieldnames=["sid","authorId", "url", "name", "paperCount", "hIndex"])

    # Write the headers to the CSV file
    csv_writer_1.writeheader()
    for author in response:
        # Entries that are not records (e.g. None) have nothing to export.
        if not isinstance(author, dict):
            continue
        count += 1
        csv_writer_1.writerow({
            "sid": count,  # Surrogate id, numbered in write order
            "authorId": author.get("authorId"),
            "url": author.get("url"),
            "name": author.get("name"),
            "paperCount": author.get("paperCount"),
            "hIndex": author.get("hIndex")
        })
print(f"Modified JSONL saved to {output_file}")

Modified JSONL saved to csv/authors.csv
