In [49]:
import pandas as pd
import numpy as np
from os.path import join
import requests
import sqlite3

In [50]:
from peewee import SqliteDatabase
from playhouse.reflection import generate_models, print_model, print_table_sql

In [51]:
# Raw sqlite3 connection to star.db; the next cell uses it to create the tables.
connection = sqlite3.connect("star.db")

In [52]:
cursor = connection.cursor()

# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE raises
# "table ... already exists" the second time the notebook is executed.
# Existing tables and their rows are left untouched, so delete star.db first
# if a clean rebuild is wanted (re-running the ingest loop appends rows).
schema_statements = (
    "CREATE TABLE IF NOT EXISTS papers (ss_id TEXT, doi TEXT, title TEXT, year INTEGER, citation_count INTEGER, venue TEXT, is_domain INTEGER, is_method_primary INTEGER, is_method_secondary INTEGER, has_dr_vis INTEGER)",
    "CREATE TABLE IF NOT EXISTS domain_paper_to_field (ss_id TEXT, field TEXT)",
    "CREATE TABLE IF NOT EXISTS domain_paper_to_method (ss_id TEXT, method_acronym TEXT, method_ss_id TEXT)",
)
for statement in schema_statements:
    cursor.execute(statement)

# Explicit commit so the schema is flushed regardless of the sqlite3 module's
# transaction mode.
connection.commit()

<sqlite3.Cursor at 0x285d576c0>

In [53]:
# Open star.db through peewee and build model classes from its tables.
db = SqliteDatabase('star.db')
models = generate_models(db)

# Expose one module-level name per table for the cells that follow.
Paper = models['papers']
DomainPaperToField = models['domain_paper_to_field']
DomainPaperToMethod = models['domain_paper_to_method']

In [54]:
# Read the ss_id CSV, using its first column as the row index.
df = pd.read_csv("data/ss_id.csv", index_col=0)

In [55]:
# Keep only the two columns the ingest loop reads and renumber the rows from 0.
df = df.loc[:, ['Acronym', 'ss_id']].reset_index(drop=True)
df.head()

Unnamed: 0,Acronym,ss_id
0,CCA,085049bec04020baecdb17bec196442a4abaedab
1,CHL,0f73d5a2669143d6fcbf4db0a4a9495bab115eb8
2,CLM,8d70b1981fc2947867a33f20256e88dc87fbb422
3,CuCA,bfb5410593385a279cc62844bd395f744dec2302
4,DM,60290252155782642d39145421ba924e78263553


In [56]:
def get_single_paper(paper_id, timeout=30):
    """Fetch metadata for one paper from the Semantic Scholar Graph API.

    Parameters
    ----------
    paper_id : str
        Identifier placed in the ``/graph/v1/paper/{paper_id}`` URL path.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status. Without this check an error
        payload would be returned as if it were paper metadata and only fail
        later when a field such as ``title`` is looked up.
    """
    fields = "title,authors,year,fieldsOfStudy,s2FieldsOfStudy,venue,externalIds,citationCount"
    r_paper = requests.get(
        f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}',
        params={"fields": fields},
        timeout=timeout,
    )
    r_paper.raise_for_status()
    return r_paper.json()

In [57]:
def get_citing_papers(paper_id, timeout=30):
    """Collect every citation record for a paper, following API pagination.

    Parameters
    ----------
    paper_id : str
        Identifier placed in the ``/graph/v1/paper/{paper_id}/citations`` path.
    timeout : float, optional
        Seconds to wait for each page before giving up (default 30).

    Returns
    -------
    list
        The concatenated ``data`` entries of all pages that were fetched.
        If a page request fails, the entries gathered so far are returned and
        a warning is emitted, so a truncated result is never silent.
    """
    import warnings

    fields = "title,authors,year,fieldsOfStudy,s2FieldsOfStudy,venue,externalIds,citationCount"
    limit = 1000
    url = f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/citations'
    offset = 0
    all_data = []
    while True:
        r_paper = requests.get(
            url,
            params={"fields": fields, "limit": limit, "offset": offset},
            timeout=timeout,
        )
        if not r_paper.ok:
            # Stay best-effort (hand back what was fetched) but surface the
            # truncation instead of quietly ending the loop.
            warnings.warn(
                f"Semantic Scholar returned HTTP {r_paper.status_code} for citations of "
                f"{paper_id} at offset {offset}; returning {len(all_data)} records fetched so far."
            )
            break
        r_dict = r_paper.json()
        all_data.extend(r_dict.get("data") or [])

        # The response's "next" value is the offset of the following page; it is
        # absent on the last page. Trust it rather than assuming a full page.
        next_offset = r_dict.get("next")
        if next_offset is None or next_offset <= offset:
            # The second condition guards against looping forever on a
            # non-advancing offset.
            break
        offset = next_offset
    return all_data

In [58]:
# Example call using the ss_id listed for the CHL method in the table above.
get_single_paper('0f73d5a2669143d6fcbf4db0a4a9495bab115eb8')

{'paperId': '0f73d5a2669143d6fcbf4db0a4a9495bab115eb8',
 'externalIds': {'DBLP': 'conf/visualization/Chalmers96',
  'MAG': '2145519405',
  'DOI': '10.1109/VISUAL.1996.567787',
  'CorpusId': 789498},
 'title': 'A linear iteration time layout algorithm for visualising high-dimensional data',
 'venue': "Proceedings of Seventh Annual IEEE Visualization '96",
 'year': 1996,
 'citationCount': 231,
 'fieldsOfStudy': ['Computer Science'],
 's2FieldsOfStudy': [{'category': 'Computer Science', 'source': 'external'},
  {'category': 'Computer Science', 'source': 's2-fos-model'}],
 'authors': [{'authorId': '144175654', 'name': 'M. Chalmers'}]}

In [59]:
def get_doi(p_info):
    """Return the DOI stored under ``p_info['externalIds']``, or None.

    None is returned when the ``externalIds`` key is absent, when its value
    is None, or when the mapping has no ``'DOI'`` entry.
    """
    if 'externalIds' not in p_info:
        return None
    external_ids = p_info['externalIds']
    if external_ids is None:
        return None
    return external_ids.get('DOI', None)

In [60]:
def clean_field(field):
    """Normalise one field-of-study label.

    "Agricultural and Food Sciences" is mapped to the capitalised spelling
    'Agricultural And Food Sciences'; every other value is returned as-is.
    """
    return 'Agricultural And Food Sciences' if field == "Agricultural and Food Sciences" else field

In [61]:
# A citing paper counts as "domain" work only if it has at least one field of
# study outside this set.
NON_DOMAIN_FIELDS = {"Computer Science", "Mathematics", "Engineering"}
# Citing papers published before this year (or with no year) are not stored.
MIN_CITING_YEAR = 2013

# For every method paper: store the method itself, then every recent paper
# citing it, plus field/method link rows for the citing papers that are
# classified as domain work.
# NOTE(review): a paper citing several methods is inserted into `papers` once
# per method; confirm downstream queries de-duplicate on ss_id.
for method_ss_id, method_acronym in zip(df['ss_id'], df['Acronym']):
    # Do all network requests before touching the database, so a failed
    # request cannot leave a method half-written.
    method_info = get_single_paper(method_ss_id)
    citing_papers = get_citing_papers(method_ss_id)

    # One transaction per method: avoids a separate commit for each of the
    # (potentially thousands of) inserted rows and rolls back cleanly if
    # anything below raises.
    with db.atomic():
        method_year = method_info['year']
        paper_obj = Paper(
            ss_id=method_ss_id,
            doi=get_doi(method_info),
            title=method_info['title'],
            year=(int(method_year) if method_year is not None else None),
            citation_count=method_info['citationCount'],
            venue=method_info['venue'],
            is_domain=0,
            is_method_primary=1,
            is_method_secondary=0,
            has_dr_vis=0
        )
        paper_obj.save()

        for citing_paper_dict in citing_papers:
            cp_info = citing_paper_dict["citingPaper"]
            year = cp_info["year"]
            if year is None or year < MIN_CITING_YEAR:
                continue

            # Prefer fieldsOfStudy; fall back to the s2FieldsOfStudy categories.
            fields = cp_info["fieldsOfStudy"]
            s2_fields = cp_info["s2FieldsOfStudy"]
            if fields is None and s2_fields is not None:
                fields = [d["category"] for d in s2_fields]
            if fields is not None:
                # s2FieldsOfStudy can list the same category once per source
                # (e.g. 'external' and 's2-fos-model'); drop repeats while
                # keeping order so each (paper, field) row is written once.
                fields = list(dict.fromkeys(fields))

            is_domain = fields is not None and not set(fields).issubset(NON_DOMAIN_FIELDS)

            paper_obj = Paper(
                ss_id=cp_info['paperId'],
                doi=get_doi(cp_info),
                title=cp_info['title'],
                year=int(year),
                citation_count=cp_info['citationCount'],
                venue=cp_info['venue'],
                is_domain=(1 if is_domain else 0),
                is_method_primary=0,
                is_method_secondary=0,
                has_dr_vis=0
            )
            paper_obj.save()

            if not is_domain:
                continue

            # Link the domain paper to each of its fields ...
            for field in fields:
                dptf_obj = DomainPaperToField(
                    ss_id=cp_info['paperId'],
                    field=clean_field(field),
                )
                dptf_obj.save()

            # ... and to the method it cites.
            dptm_obj = DomainPaperToMethod(
                ss_id=cp_info['paperId'],
                method_acronym=method_acronym,
                method_ss_id=method_ss_id,
            )
            dptm_obj.save()