In [48]:
import requests
import json
import os
import time
import pandas as pd
import numpy as np
from ratelimit import limits, RateLimitException, sleep_and_retry
from backoff import on_exception, expo
from local.config import semanticScholar

In [25]:
@sleep_and_retry
@limits(calls=10, period=32)
def query_api(query, scroll=0):
    """Fetch one page of paper-search results from the Semantic Scholar Graph API.

    Calls are throttled by the decorators to at most 10 per 32 seconds; when
    the budget is exhausted the call sleeps and is retried.

    Parameters
    ----------
    query : str
        Free-text search string. It is passed via ``params`` so that spaces and
        reserved characters (``&``, ``#``, ``+`` ...) are URL-encoded instead
        of corrupting the query string.
    scroll : int, default 0
        Result offset of the page to fetch.

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status, so that an error payload is
        never mistaken for a page of results further down the pipeline.
    """
    base = "https://api.semanticscholar.org/graph/v1"
    obj = "paper"
    params = {
        "query": query,
        "limit": 100,
        "offset": scroll,
        "fields": "abstract,title,year,externalIds",
    }

    response = requests.get(f"{base}/{obj}/search", params=params)
    response.raise_for_status()

    return response.json()

def scroll(query, scroll=0):
    """Page through the search results for ``query`` and collect every page.

    Parameters
    ----------
    query : str
        Search string forwarded to ``query_api``.
    scroll : int, default 0
        Offset of the first page to fetch.

    Returns
    -------
    list
        One entry per fetched page, each entry being that page's ``"data"``
        value (so the result is a list of pages, not a flat list of papers).
    """
    # Their free API only allows the top 10,000 results. Blocks if offset + limit > 10,000
    max_offset = 9800

    result = query_api(query, scroll=scroll)
    allResults = [result["data"]]

    while scroll <= max_offset:
        # The last page of a result set carries no "next" offset; stop there
        # instead of failing with a KeyError on short result sets.
        next_offset = result.get("next")
        if next_offset is None:
            break
        scroll = next_offset
        result = query_api(query, scroll=scroll)
        allResults.append(result["data"])

    return allResults

def clean_results(df):
    """Normalise raw search records into one row per paper.

    The input frame is expected to carry the columns ``paperId``, ``title``,
    ``abstract``, ``year`` and ``externalIds`` (a dict of id-type -> id per
    row). The caller's frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records as loaded from the API / cached JSON pages.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``paperId`` with ``lcase_ttl``, ``abstract`` (lower-cased),
        ``publication_year`` (missing years become 0), one column per
        lower-cased external-id type (e.g. ``doi``), and ``data_source``.
    """
    # Work on a copy so the caller's raw frame is left untouched.
    df = df.copy()

    # Clean Columns
    df["lcase_ttl"] = df["title"].str.lower()
    df["abstract"] = df["abstract"].str.lower()
    df["publication_year"] = df["year"].fillna(0).astype(int)

    # Transform External IDs Array Into Columns
    # Materialise the dict views as lists so each cell holds a plain list-like
    # that explode() can unpack pairwise.
    df["values"] = df["externalIds"].apply(lambda ids: list(ids.values()))
    df["keys"] = df["externalIds"].apply(lambda ids: list(ids.keys()))
    df = df.explode(["keys", "values"]).drop("externalIds", axis=1).drop_duplicates()
    df["keys"] = df["keys"].str.lower()
    df = (
        df.pivot(
            index=["paperId", "lcase_ttl", "abstract", "publication_year"],
            columns="keys",
            values="values",
        )
        .reset_index()
        .set_index("paperId")
    )
    df["data_source"] = "semanticScholar"

    df = df.drop_duplicates()

    return df

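# Live-API usage, kept for reference (scroll() returns a list of pages, so flatten before building a frame):
#result = scroll("emission reduction")
#result_ccs = scroll("capture and storage")
#df1 = pd.DataFrame([paper for page in result for paper in page])
#df2 = pd.DataFrame([paper for page in result_ccs for paper in page])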

In [26]:
# Temp Fix Because API was taking too long, created separate script to download results to json #

def _load_result_pages(results_dir):
    """Read every ``.json`` page in ``results_dir`` and stack their ``"data"`` records into one frame."""
    pages = []
    # Sorted so the row order does not depend on the filesystem's listing order.
    for file in sorted(os.listdir(results_dir)):
        if file.endswith(".json"):
            with open(os.path.join(results_dir, file), 'r') as j:
                pages.append(json.load(j))
    return pd.concat([pd.DataFrame(page["data"]) for page in pages], ignore_index=True)


# Query 1: keep every cached hit.
df1 = _load_result_pages("../engineEval/semanticScholar/q1_results/")
df_q1 = clean_results(df1)
df_q1["q_params"] = "emission reduction"

# Query 2: keep only hits whose abstract or title matches the utilisation/usage pattern.
df2 = _load_result_pages("../engineEval/semanticScholar/q2_results/")
df_q2 = clean_results(df2)
# NOTE(review): with str.match and no DOTALL flag this pattern cannot match
# text containing an interior newline -- confirm that is intended.
usage_pattern = r"^(?=.*[uU](tili.ation|sage)).*$"
mentions_usage = (df_q2["abstract"].fillna("").str.match(usage_pattern)) | \
                 (df_q2["lcase_ttl"].fillna("").str.match(usage_pattern))
# .copy() so the column assignment below targets an independent frame and does
# not trigger SettingWithCopyWarning.
df_q2 = df_q2.loc[mentions_usage].copy()
df_q2["q_params"] = "carbon/c02 capture and storage"

In [17]:
def fix_controls(df):
    """Normalise a control dataset and split off the title-matching subset.

    The input must provide ``nacsos_id``, ``wos_id``, ``title``, ``authors``,
    ``abstract``, ``doi``, ``pubyear`` and a boolean
    ``query_appears_in_title`` column. The caller's frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw control records.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(all_controls, title_controls)``. ``all_controls`` holds the
        de-duplicated records without the ``query_appears_in_title`` flag;
        ``title_controls`` holds the de-duplicated records where that flag is
        true (flag column retained).
    """
    columns = ["nacsos_id","wos_id","lcase_ttl","authors","abstract","doi","pubyear","query_appears_in_title"]

    # assign() returns a new frame, so the caller's data is left untouched.
    # Titles and DOIs are cast to str and missing years become 0.
    normalised = df.assign(
        lcase_ttl=df["title"].str.lower().astype(str),
        pubyear=df["pubyear"].fillna(0).astype(int),
        doi=df["doi"].astype(str),
    ).loc[:, columns]

    df_ttl = (
        normalised.loc[normalised["query_appears_in_title"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    df_all = (
        normalised.loc[:, normalised.columns != "query_appears_in_title"]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    return df_all, df_ttl

# Read the two control exports and normalise each one; every call yields the
# full control set plus the subset flagged as matching the query in the title.
df_ctrl1, df_ctrl1_ttl = fix_controls(
    pd.read_csv("../engineEval/wos_dataset/wos_emission_query.csv")
)
df_ctrl2, df_ctrl2_ttl = fix_controls(
    pd.read_csv("../engineEval/wos_dataset/wos_ccs_query.csv")
)

In [29]:
def overlap_eval(df1,df2):
    """Count how many control records are also found in the API results.

    Records are matched twice -- once on ``lcase_ttl`` and once on ``doi`` --
    using inner merges, so each count is a number of merged rows.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Control records (needs ``nacsos_id``, ``wos_id``, ``lcase_ttl``,
        ``doi``, ``pubyear``).
    df2 : pandas.DataFrame
        API records (needs ``lcase_ttl``, ``doi``, ``publication_year``,
        ``q_params``, ``data_source``).

    Returns
    -------
    dict
        ``wos_input`` / ``api_input``: row counts of the two inputs.
        ``intersection_doi`` / ``intersection_title``: rows matched on DOI /
        on title. ``intersection_doi_and_title``: de-duplicated rows matched
        on title *or* DOI (the union of the two match sets, despite the name).
        ``query_criteria`` / ``query_source``: taken from the first row of
        ``df2``, or ``None`` when ``df2`` is empty.
    """
    df_t = pd.merge(df1,df2, on="lcase_ttl", suffixes=(None,"_api")).drop("doi_api", axis=1).reset_index(drop=True)
    df_d = pd.merge(df1,df2, on="doi", suffixes=(None,"_api")).drop("lcase_ttl_api", axis=1).reset_index(drop=True)

    report_columns = ["nacsos_id","wos_id","doi","pubyear","publication_year","lcase_ttl","q_params","data_source"]
    df_td = (
        pd.concat([df_t,df_d])
        .loc[:, report_columns]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # An empty API slice (e.g. a year with no hits) has no first row to read
    # the labels from; report None instead of raising IndexError.
    has_api_rows = len(df2.index) > 0

    results = {"wos_input" : len(df1.index),
               "api_input" : len(df2.index),
               "intersection_doi" : len(df_d.index),
               "intersection_title" : len(df_t.index),
               "intersection_doi_and_title" : len(df_td.index),
               "query_criteria" : df2["q_params"].iloc[0] if has_api_rows else None,
               "query_source" : df2["data_source"].iloc[0] if has_api_rows else None
              }

    return results

toplineResults = []
yearlyResults = []

# One entry per query: (query label, control frame, API frame, years to break out).
evaluations = (
    ("1", df_ctrl1, df_q1, [1995, 2015, 2021, 2022]),
    ("2", df_ctrl2, df_q2, [2015, 2021, 2022]),
)

for query_label, controls, api_hits, years in evaluations:
    # Overall overlap for the query.
    toplineResults.append(overlap_eval(controls, api_hits))

    # Overlap restricted to a single publication year on both sides.
    for yr in years:
        controls_in_year = controls.loc[controls["pubyear"] == yr]
        api_hits_in_year = api_hits.loc[api_hits["publication_year"] == yr]
        yearly = overlap_eval(controls_in_year, api_hits_in_year)
        yearlyResults.append({**yearly, "year": yr, "query": query_label})

df_results = pd.DataFrame(toplineResults).set_index(["query_source","query_criteria"])
df_results_yr = pd.DataFrame(yearlyResults).set_index(["query_source","query_criteria","year","query"])

In [30]:
# Topline overlap counts: one row per (query_source, query_criteria).
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,wos_input,api_input,intersection_doi,intersection_title,intersection_doi_and_title
query_source,query_criteria,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
semanticScholar,emission reduction,3160,8242,62,82,104
semanticScholar,carbon/c02 capture and storage,134,1104,40,41,48


In [31]:
# Per-year overlap counts: one row per (query_source, query_criteria, year, query).
df_results_yr

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,wos_input,api_input,intersection_doi,intersection_title,intersection_doi_and_title
query_source,query_criteria,year,query,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
semanticScholar,emission reduction,1995,1,35,72,4,2,4
semanticScholar,emission reduction,2015,1,819,419,37,58,71
semanticScholar,emission reduction,2021,1,1995,38,8,8,12
semanticScholar,emission reduction,2022,1,189,1,0,0,0
semanticScholar,carbon/c02 capture and storage,2015,2,38,74,9,10,13
semanticScholar,carbon/c02 capture and storage,2021,2,86,167,23,23,27
semanticScholar,carbon/c02 capture and storage,2022,2,7,47,1,1,1


In [49]:
### Datasets Download ###

# The API key is read from the imported `semanticScholar` config object rather
# than being hard-coded here; `headers` is the module-level header dict that
# get_datasets() sends with every request.
api_key = semanticScholar.api_key
headers= {"X-API-KEY": api_key}

@sleep_and_retry
@limits(calls=10, period=32)
def get_datasets(release_id=None, dataset=None):
    """Query the Semantic Scholar Datasets API at one of three levels.

    Calls are throttled by the decorators to at most 10 per 32 seconds and use
    the module-level ``headers`` for authentication.

    Parameters
    ----------
    release_id : str, optional
        When omitted, the release listing endpoint is queried (``dataset`` is
        then ignored). When given alone, that release is queried.
    dataset : str, optional
        Together with ``release_id``, queries that dataset within the release.

    Returns
    -------
    object
        The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status. Without this check an error
        payload would be handed back as if it were data (e.g. ``max()`` over
        an error dict would quietly yield one of its keys as a "release id").
    """
    url = "https://api.semanticscholar.org/datasets/v1/release/"
    if release_id is not None:
        url = f"{url}{release_id}"
        if dataset is not None:
            url = f"{url}/dataset/{dataset}"

    response = requests.get(url, headers=headers)
    response.raise_for_status()

    return response.json()

In [25]:
# Pick the greatest release id from the listing and fetch that release.
# NOTE(review): presumably release ids are date strings, so max() is the most
# recent release -- confirm against the API.
release_ids = get_datasets()
newest_release = max(release_ids)
datasets = get_datasets(release_id=newest_release)

In [26]:
# Timestamp for this download run; reused in the output paths written later.
timestr = time.strftime("%Y%m%d-%H%M%S")

# One API response per dataset listed in the newest release.
urls = [
    get_datasets(release_id=newest_release, dataset=entry["name"])
    for entry in datasets["datasets"]
]

In [47]:
# Write to txt to wget on #

release_dir = f"./local/{newest_release}/{timestr}"
links_dir = f"{release_dir}/links/"

if not os.path.exists(links_dir):
    os.makedirs(links_dir)

description = []

for r in urls:
    filename = r["name"]
    meta = {
        "dataset": r["name"],
        "description": r["description"],
        "README": r["README"],
    }

    # One link per line, so the file can be fed straight to a downloader.
    with open(f"{links_dir}{filename}_{timestr}.txt", "w") as f:
        f.write('\n'.join(r["files"]))

    description.append(meta)

# Human-readable summary of every dataset whose links were written above.
with open(f"{release_dir}/description.json", "w") as f:
    json.dump(description, f, indent=2)
        