In [196]:
import pandas as pd
import numpy as np
from os.path import join
import requests
import sqlite3
import os

In [197]:
from peewee import SqliteDatabase
from playhouse.reflection import generate_models, print_model, print_table_sql

In [198]:
# Request headers sent with every Semantic Scholar call in this notebook.
# The API key is read from the S2_API_KEY environment variable so it never
# appears in the notebook; the value is None when the variable is unset.
HEADERS = {"x-api-key": os.getenv("S2_API_KEY")}

In [199]:
# Raw sqlite3 connection to star.db; used by the next cell to (re)create the
# tables before peewee reflects them from the same file.
connection = sqlite3.connect("star.db")

In [200]:
# Reset the schema: drop the three tables and recreate them empty.
# `IF EXISTS` keeps this cell runnable on a brand-new star.db (a bare
# DROP TABLE raises sqlite3.OperationalError when the table is missing),
# so the notebook survives Restart Kernel -> Run All.
cursor = connection.cursor()
cursor.execute("DROP TABLE IF EXISTS papers")
cursor.execute("DROP TABLE IF EXISTS domain_paper_to_field")
cursor.execute("DROP TABLE IF EXISTS domain_paper_to_method")
# One row per paper occurrence (method papers and the papers citing them).
cursor.execute("CREATE TABLE papers (ss_id TEXT, doi TEXT, title TEXT, year INTEGER, citation_count INTEGER, venue TEXT, is_domain_full INTEGER, is_domain_partial INTEGER, is_preprint INTEGER, is_method_primary INTEGER, is_method_secondary INTEGER, has_dr_vis INTEGER)")
# One row per (citing paper, field of study) pair.
cursor.execute("CREATE TABLE domain_paper_to_field (ss_id TEXT, field TEXT)")
# One row per (citing paper, cited method) pair.
cursor.execute("CREATE TABLE domain_paper_to_method (ss_id TEXT, method_acronym TEXT, method_ss_id TEXT)")

<sqlite3.Cursor at 0x29dc61c40>

In [201]:
# Reflect the tables created above into peewee model classes.
db = SqliteDatabase('star.db')
models = generate_models(db)

# Bind each reflected model to a readable module-level name.
Paper = models['papers']
DomainPaperToField = models['domain_paper_to_field']
DomainPaperToMethod = models['domain_paper_to_method']

In [202]:
# Load the table of DR methods; the first CSV column is used as the index.
# Later cells rely on the 'Acronym' and 'ss_id' columns.
df = pd.read_csv("data/dr_algorithms_with_ids.csv", index_col=0)

In [203]:
# Keep only the rows that have a value in the 'ss_id' column.
has_ss_id = df['ss_id'].notna()
df = df[has_ss_id]

In [204]:
# Narrow the frame to the two columns the ingest loop reads and renumber
# the rows from 0.
df = df.loc[:, ['Acronym', 'ss_id']].reset_index(drop=True)
df.head()

Unnamed: 0,Acronym,ss_id
0,AE,02552a8b40f3a82a5353f596264db71d899a9b4a
1,CCA,085049bec04020baecdb17bec196442a4abaedab
2,CHL,0f73d5a2669143d6fcbf4db0a4a9495bab115eb8
3,CLM,8d70b1981fc2947867a33f20256e88dc87fbb422
4,CuCA,bfb5410593385a279cc62844bd395f744dec2302


In [205]:
def get_single_paper(paper_id, timeout=30):
    """Fetch metadata for one paper from the Semantic Scholar Graph API.

    Parameters
    ----------
    paper_id : str
        Identifier placed in the `/paper/{paper_id}` URL path.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        `requests` waits indefinitely, which can hang the whole notebook.

    Returns
    -------
    dict
        The decoded JSON body, returned as-is. No status check is done here:
        the ingest loop treats a body without a 'title' key as an error
        payload.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failures or when `timeout` is exceeded.
    ValueError
        If the response body is not valid JSON.
    """
    fields = "title,authors,year,fieldsOfStudy,s2FieldsOfStudy,venue,externalIds,citationCount,openAccessPdf"
    r_paper = requests.get(
        f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}?fields={fields}',
        headers=HEADERS,
        timeout=timeout,
    )
    return r_paper.json()

In [206]:
def get_citing_papers(paper_id, timeout=30, max_retries=3):
    """Collect every available citation record for a paper, page by page.

    Parameters
    ----------
    paper_id : str
        Identifier placed in the `/paper/{paper_id}/citations` URL path.
    timeout : float, optional
        Seconds to wait for each HTTP response.
    max_retries : int, optional
        Extra attempts per page when the server answers 429 or 5xx.

    Returns
    -------
    list
        Concatenation of the `data` arrays of all pages fetched. If a page
        still has no `data` after the retries, paging stops, a notice is
        printed, and the records gathered so far are returned (best-effort,
        as before, but no longer silent).

    Raises
    ------
    requests.exceptions.RequestException
        On connection failures or when `timeout` is exceeded.
    ValueError
        If a response body is not valid JSON.
    """
    import time

    fields = "title,authors,year,fieldsOfStudy,s2FieldsOfStudy,venue,externalIds,citationCount,openAccessPdf"
    page_size = 1000
    # The original special-cased offset 9000 -> limit 999, i.e. it kept
    # offset + limit at 9999 or below; presumably the API rejects anything
    # larger (TODO confirm against the API docs). Clamp every page that way.
    max_window = 9999

    offset = 0
    all_data = []
    while offset < max_window:
        limit = min(page_size, max_window - offset)
        url = f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/citations?fields={fields}&limit={limit}&offset={offset}'

        for attempt in range(max_retries + 1):
            r_paper = requests.get(url, headers=HEADERS, timeout=timeout)
            r_dict = r_paper.json()
            transient = r_paper.status_code == 429 or r_paper.status_code >= 500
            if "data" in r_dict or not transient or attempt == max_retries:
                break
            # Exponential backoff (1s, 2s, 4s, ...) before retrying the page.
            time.sleep(2 ** attempt)

        if "data" not in r_dict:
            # Error payload: report it instead of silently truncating.
            reason = r_dict.get("message") or r_dict.get("error") or r_dict
            print(f"Stopped fetching citations of {paper_id} at offset {offset}: {reason}")
            break

        all_data += r_dict["data"]
        if "next" not in r_dict:
            break
        offset += limit
    return all_data

In [207]:
def get_doi(p_info):
    """Return the DOI stored under `p_info['externalIds']['DOI']`, or None.

    None is returned when the 'externalIds' key is missing, when its value
    is None, or when it has no 'DOI' entry.
    """
    if 'externalIds' in p_info:
        external_ids = p_info['externalIds']
        if external_ids is not None:
            return external_ids.get('DOI')
    return None

In [208]:
def clean_field(field):
    """Normalise a field-of-study label.

    The spelling "Agricultural and Food Sciences" is rewritten with a
    capital "And"; every other value is returned unchanged.
    """
    return 'Agricultural And Food Sciences' if field == "Agricultural and Food Sciences" else field

In [209]:
def is_preprint_venue(venue):
    """Return 1 if `venue` is one of the listed preprint servers, else 0.

    A venue of None yields 0. Matching is exact (case-sensitive).
    """
    preprint_venues = ("bioRxiv", "medRxiv", "arXiv.org", "PeerJ Preprints")
    return int(venue is not None and venue in preprint_venues)

In [210]:
def is_domain_partial(fields):
    """Return True when at least one field lies outside CS / Math / Eng.

    `fields` is an iterable of field-of-study names or None. None and an
    empty iterable both give False; so does any list made up solely of
    "Computer Science", "Mathematics" and "Engineering".
    """
    if fields is None:
        return False
    method_fields = {"Computer Science", "Mathematics", "Engineering"}
    outside = set(fields) - method_fields
    return len(outside) > 0

In [211]:
def is_domain_full(fields):
    """Return True when the paper has fields and none of them is CS / Math / Eng.

    `fields` is a list of field-of-study names or None.

    None and an empty list both give False: a paper with no known field
    cannot be called "fully domain". Previously an empty list returned True
    here while `is_domain_partial([])` returned False, so a paper could be
    flagged fully-domain without being partially-domain.
    """
    if not fields:
        return False
    method_fields = ("Computer Science", "Mathematics", "Engineering")
    return not any(name in fields for name in method_fields)

In [212]:
# NOTE(review): leftover debugging cell. Within the visible notebook,
# `method_info` is only assigned inside the ingest loop in a later cell, so
# this raises NameError on a fresh kernel (Restart & Run All). The captured
# output below is an API error payload, not paper metadata. Consider
# deleting this cell.
method_info

{'message': 'Network error communicating with endpoint'}

In [219]:
# Ingest loop: for every DR method in `df`, store the method's own paper and
# then every paper citing it (published in MIN_CITING_YEAR or later), plus
# the citing paper's fields of study and its link back to the method.
#
# Changes relative to the earlier version of this cell:
#   * error payloads are reported via .get(), so a payload without a
#     'message' key no longer raises KeyError;
#   * categories taken from `s2FieldsOfStudy` are de-duplicated (order kept),
#     so a category listed more than once yields a single row in
#     domain_paper_to_field;
#   * all rows for one method are written inside a single transaction
#     instead of one autocommit per save(), and a failure part-way through
#     leaves no half-written method behind.
MIN_CITING_YEAR = 2013

for row_i, row in df.iterrows():
    method_ss_id = row['ss_id']
    method_acronym = row['Acronym']

    method_info = get_single_paper(method_ss_id)
    if 'title' not in method_info:
        # Not paper metadata: print whatever explanation the payload carries
        # together with the row index, then move on to the next method.
        print(method_info.get('message', method_info.get('error', method_info)), row_i)
        continue

    # Do the network fetch before opening the write transaction.
    citing_papers = get_citing_papers(method_ss_id)

    with db.atomic():
        paper_obj = Paper(
            ss_id=method_ss_id,
            doi=get_doi(method_info),
            title=method_info['title'],
            year=(int(method_info['year']) if method_info['year'] is not None else None),
            citation_count=method_info['citationCount'],
            venue=method_info['venue'],
            is_preprint=is_preprint_venue(method_info['venue']),
            is_domain_full=0,
            is_domain_partial=0,
            is_method_primary=1,
            is_method_secondary=0,
            has_dr_vis=0
        )
        paper_obj.save()

        for citing_paper_dict in citing_papers:
            cp_info = citing_paper_dict["citingPaper"]

            year = cp_info["year"]
            if year is None or year < MIN_CITING_YEAR:
                continue

            # Prefer `fieldsOfStudy`; fall back to the categories listed in
            # `s2FieldsOfStudy`, dropping repeats while keeping their order.
            fields = cp_info["fieldsOfStudy"]
            s2_fields = cp_info["s2FieldsOfStudy"]
            if fields is None and s2_fields is not None:
                fields = list(dict.fromkeys(d["category"] for d in s2_fields))

            is_domain = is_domain_partial(fields)
            paper_obj = Paper(
                ss_id=cp_info['paperId'],
                doi=get_doi(cp_info),
                title=cp_info['title'],
                year=int(year),
                citation_count=cp_info['citationCount'],
                venue=cp_info['venue'],
                is_preprint=is_preprint_venue(cp_info['venue']),
                is_domain_full=is_domain_full(fields),
                is_domain_partial=is_domain,
                is_method_primary=0,
                is_method_secondary=0,
                has_dr_vis=0
            )
            paper_obj.save()

            if fields is not None:
                for field in fields:
                    dptf_obj = DomainPaperToField(
                        ss_id=cp_info['paperId'],
                        field=clean_field(field),
                    )
                    dptf_obj.save()

            dptm_obj = DomainPaperToMethod(
                ss_id=cp_info['paperId'],
                method_acronym=method_acronym,
                method_ss_id=method_ss_id,
            )
            dptm_obj.save()