In [30]:
import requests
import json
import os
import pandas as pd
import numpy as np
from ratelimit import limits, RateLimitException, sleep_and_retry
from backoff import on_exception, expo

In [130]:
@sleep_and_retry
@limits(calls=10, period=32)
def query_api(query, scroll=0):
    """Fetch one page of Semantic Scholar paper-search results.

    The call is throttled by the ``ratelimit`` decorators above to at most
    10 calls per 32 seconds; ``sleep_and_retry`` waits instead of raising
    when that budget is exhausted.

    Parameters
    ----------
    query : str
        Free-text search query. It is passed through ``params`` so that
        spaces and reserved characters (``&``, ``#``, ``+`` ...) are
        URL-encoded instead of corrupting the query string.
    scroll : int, default 0
        Offset of the first result to return.

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status, so an error payload is
        never mistaken for a page of results.
    requests.Timeout
        If the server does not answer within the timeout.
    """
    base = "https://api.semanticscholar.org/graph/v1"
    obj = "paper"
    params = {
        "query": query,
        "limit": 100,
        "offset": scroll,
        "fields": "abstract,title,year,externalIds",
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(f"{base}/{obj}/search", params=params, timeout=30)
    response.raise_for_status()

    return response.json()

def scroll(query, scroll=0):
    """Collect consecutive pages of search results for ``query``.

    Pages are requested through ``query_api`` starting at offset ``scroll``
    and following the ``"next"`` offset reported by each response.

    Parameters
    ----------
    query : str
        Search query forwarded to ``query_api``.
    scroll : int, default 0
        Offset of the first page to request.

    Returns
    -------
    list of list
        One entry per page; each entry is that page's ``"data"`` value.
        The pages are NOT flattened into a single list.
    """
    # Their free API only allows the top 10,000 results.
    # Blocks if offset + limit > 10,000
    max_offset = 9800

    pages = []

    result = query_api(query, scroll=scroll)
    pages.append(result["data"])

    while scroll <= max_offset:
        # Without a "next" offset there is nothing left to fetch. Indexing
        # result["next"] directly would raise KeyError for any search with
        # fewer results than the cap.
        next_offset = result.get("next")
        if next_offset is None:
            break
        scroll = next_offset
        result = query_api(query, scroll=scroll)
        pages.append(result["data"])

    return pages

def clean_results(df):
    """Normalise a frame of search results and spread external IDs into columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``paperId``, ``title``, ``abstract``,
        ``year`` and ``externalIds``; each ``externalIds`` entry must be a
        dict (its ``keys()``/``values()`` are read).
        The frame passed in is left unmodified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``paperId`` with lower-cased ``title`` and ``abstract``,
        an integer ``year`` (missing years become 0) and one column per
        distinct key found in ``externalIds``.
    """
    # Work on a copy: the column assignments below would otherwise leak
    # into the caller's frame (lower-cased text, extra "keys"/"values"
    # columns), making the raw data unrecoverable after one call.
    df = df.copy()

    # Clean Columns
    df["title"] = df["title"].str.lower()
    df["abstract"] = df["abstract"].str.lower()
    df["year"] = df["year"].fillna(0).astype(int)

    # Transform External IDs Array Into Columns.
    # Materialise the dict views as lists so both columns hold plain,
    # equally long list-likes for the paired explode.
    df["values"] = df["externalIds"].apply(lambda x: list(x.values()))
    df["keys"] = df["externalIds"].apply(lambda x: list(x.keys()))
    df = df.explode(["keys", "values"]).drop("externalIds", axis=1).drop_duplicates()
    df = (
        df.pivot(index=["paperId", "title", "abstract", "year"], columns="keys", values="values")
        .reset_index()
        .set_index("paperId")
    )

    return df.drop_duplicates()

#result = scroll("emission reduction")
#result_ccs = scroll("capture and storage")
#df1 = pd.DataFrame(result["data"])
#df2 = pd.DataFrame(result_ccs["data"])

In [101]:
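# TODO: the regex below produces inconsistent results and needs investigation.
# Things to check in the expression: `.` does not match newlines inside an
# abstract, `02` is written with the digit zero rather than the letter "o",
# and the mask is built from `df` while it is applied to `df2`.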

#df_q2 = df2.loc[(df["abstract"].fillna("").str.match(r"^(?=.*[cC](02|arbon))(?=.*[uU](tili.ation|sage)).*$")),["title","abstract"]]

In [131]:
# Temporary workaround: querying the API live was taking too long, so a
# separate script downloaded the result pages to JSON files, loaded here.

def _load_json_pages(directory):
    """Return the parsed content of every ``.json`` file in ``directory``.

    Files are read in sorted name order so the resulting list (and any
    frame concatenated from it) is the same on every run; ``os.listdir``
    alone gives no ordering guarantee.
    """
    pages = []
    for file in sorted(os.listdir(directory)):
        if file.endswith(".json"):
            # JSON is UTF-8 by specification; do not rely on the platform
            # default encoding.
            with open(os.path.join(directory, file), "r", encoding="utf-8") as j:
                pages.append(json.load(j))
    return pages


jsons = _load_json_pages("../engineEval/semanticScholar/q1_results/")
df1 = pd.concat([pd.DataFrame(x["data"]) for x in jsons], ignore_index=True)
df_q1 = clean_results(df1)

jsons = _load_json_pages("../engineEval/semanticScholar/q2_results/")
df2 = pd.concat([pd.DataFrame(x["data"]) for x in jsons], ignore_index=True)
df_q2 = clean_results(df2)

In [167]:
# Keep only the q2 papers whose abstract mentions "usage" or
# "utilization"/"utilisation" (missing abstracts never match).
# str.contains searches the whole text. The previous anchored
# `^(?=.*...).*$` pattern with str.match silently dropped every abstract
# containing a line break, because `.` does not match "\n" and `$` only
# matches at the end of the string.
# The group is non-capturing so pandas does not warn about match groups.
df_q2 = df_q2.loc[
    df_q2["abstract"].fillna("").str.contains(r"[uU](?:tili.ation|sage)", regex=True)
]

In [168]:
# Show the filtered q2 frame as this cell's output.
df_q2

keys,title,abstract,year,ArXiv,CorpusId,DBLP,DOI,MAG,PubMed,PubMedCentral
paperId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
00555120ba09661655a05ece06f2087fbc4d692e,multidisciplinary assessment of a novel carbon...,the current work investigates the feasibility ...,2022,,246433942,,10.3390/en15031021,,,
00c46b5d32e1141cfb7b53e47e1ca25bed9a91f1,the system-wide economics of a carbon dioxide ...,this letter compares several bounding cases fo...,2013,,7846139,,10.1088/1748-9326/8/3/034030,2135935756,,
00da2e575206a9e44d6963c43cd07cacbb2bf10d,communicating leakage risk in the hydrogen eco...,hydrogen may play a crucial part in delivering...,2019,,203506038,,,2973035317,,
010e0da5c8992b0d151ccd49d15cd179842f5ad4,development of oxy-fuel igcc system with co2 r...,"to cope with global warming problem, utility c...",2011,,112509992,,10.1115/POWER2011-55458,2801854867,,
014a8756759eded1fe0ca37ce71a8c29c49d438c,enhanced coal bed methane recovery finalized t...,the recovery of coal bed methane can be enhanc...,2009,,132787715,,10.3929/ETHZ-A-005916139,2514638653,,
...,...,...,...,...,...,...,...,...,...,...
fec90f9b7a8f67aaae5f1eda3cab15c69b32abed,development of multimode gas-fired combined-cy...,operation of power plants with carbon dioxide ...,2022,,247572171,,10.1007/s11356-022-19748-0,,35307797,
fee3ae26720f7c7022517f99228b3dfd867dbb6c,smart energy consumption feedback - connecting...,tinuously increasing and now accounts for abou...,2013,,45777568,journals/ercim/WeissMB13,,2402440740,,
ff0fea834ada92594c8c37f6ee5daefbc955f1c9,electrochemical carbon dioxide capture and rel...,anthropogenic carbon dioxide (co2) emission fr...,2022,,245914024,,10.1021/jacs.1c10656,,35020393,
ff463573a1aeb6278afa0a9dba61723af4b91d6c,system for capturing carbon dioxide from exhau...,the exhaust gas carbon capture system is discl...,2009,,103565568,,,2774618096,,
