In [1]:
import os

import numpy as np
import pandas as pd
import requests
from backoff import on_exception, expo
from ratelimit import limits, RateLimitException, sleep_and_retry

In [68]:
# CORE API 3.0.0 BETA #

# Read the key from the CORE_API_KEY environment variable so the real
# credential never has to be saved in the notebook. "api" stays as the
# fallback placeholder, so behaviour is unchanged when the variable is unset.
apikey = os.environ.get("CORE_API_KEY", "api")

#@on_exception(expo, RateLimitException, max_tries=5)
@sleep_and_retry
@limits(calls=5, period=61)
def query_api(query, scrollId=None):
    """Fetch one page of works from the CORE v3 search endpoint.

    The call is wrapped by ``limits(calls=5, period=61)`` and
    ``sleep_and_retry`` from the ``ratelimit`` package (presumably: at most
    5 calls per 61 seconds, sleeping instead of failing when the budget is
    used up -- confirm against the ratelimit docs).

    Parameters
    ----------
    query : str
        Search expression. It is inserted into the URL verbatim, so the
        caller must pass it already URL-encoded (e.g. ``"emission+reduction"``).
    scrollId : str, optional
        Scroll identifier returned by a previous page. When falsy, a new
        scroll is requested with ``scroll=true``.

    Returns
    -------
    dict
        The decoded JSON response, with the ``"fullText"`` key removed from
        every entry of ``"results"``.

    Raises
    ------
    Exception
        If the HTTP status code is not 200.
    requests.exceptions.Timeout
        If the server does not answer within the timeout.
    """
    headers = {"Authorization": "Bearer " + apikey}
    search_url = "https://api.core.ac.uk/v3/search/works"

    # The first request opens a scroll; follow-up requests continue it.
    paging = f"scrollId={scrollId}" if scrollId else "scroll=true"

    # Without a timeout, requests waits forever on a stalled connection,
    # which would freeze the whole notebook.
    response = requests.get(
        f"{search_url}?q={query}&limit=550&{paging}",
        headers=headers,
        timeout=60,
    )

    print(response.status_code)
    if response.status_code != 200:
        raise Exception(f"API response: {response.status_code}")

    hit = response.json()

    # Drop the full text of each record before returning the page.
    for element in hit["results"]:
        element.pop("fullText", None)

    return hit

def iter_query(query):
    """Collect every page of results for ``query`` by following the scroll.

    Pages are requested through ``query_api`` until the response carries no
    ``scrollId`` (missing or ``None``) or a page comes back with no results.
    The empty-page stop guards against looping forever if the API keeps
    handing out a scroll id after the data is exhausted
    (NOTE(review): confirm how CORE signals the end of a scroll).

    Parameters
    ----------
    query : str
        Search expression, passed unchanged to ``query_api``.

    Returns
    -------
    list of list
        One entry per fetched page, each being that page's ``"results"`` list.
    """
    allResults = []

    page = query_api(query)
    allResults.append(page["results"])

    # .get() so a final page without a "scrollId" field ends the loop instead
    # of raising KeyError.
    while page["results"] and page.get("scrollId") is not None:
        page = query_api(query, scrollId=page["scrollId"])
        allResults.append(page["results"])

    return allResults

In [74]:
# Fetch a single page of search results for "emission+reduction" (no scrollId
# is passed, so this is a first-page request).
# NOTE(review): the saved output of this cell is a 500 error, yet later cells
# read `test`; they presumably rely on an earlier successful run. Re-run this
# cell successfully before trusting the outputs below.
test = query_api("emission+reduction")

500


Exception: API response: 500

In [125]:
def clean_results(results):
    """Turn one API response page into a tidy DataFrame.

    Parameters
    ----------
    results : dict
        Response dictionary whose ``"results"`` entry is a list of records.
        Every column named in ``keep_col`` must be present in at least one
        record, otherwise the column selection raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The ``keep_col`` columns plus:

        * ``lcase_ttl`` -- lower-cased ``title``
        * ``pubyear``   -- ``yearPublished`` as int, with missing years set to 0

        ``abstract`` and ``publisher`` are lower-cased in place; a publisher
        wrapped in single quotes has that one pair of quotes removed.
        Missing publishers stay missing.
    """
    keep_col = ["id","magId","oaiIds","doi","documentType","title","abstract","publisher","yearPublished","journals","dataProviders","identifiers","authors"]

    def _strip_wrapping_quotes(value):
        # Only strings can carry quotes; nulls pass through unchanged instead
        # of raising AttributeError on .startswith.
        if isinstance(value, str) and value.startswith("'") and value.endswith("'"):
            return value[1:-1]
        return value

    # .copy() makes the selection an independent frame, so the column
    # assignments below never write into a view of the raw data.
    df = pd.DataFrame(results["results"]).loc[:, keep_col].copy()
    df["lcase_ttl"] = df["title"].str.lower()
    df["abstract"] = df["abstract"].str.lower()
    df["publisher"] = df["publisher"].str.lower().map(_strip_wrapping_quotes)
    df["pubyear"] = df["yearPublished"].fillna(0).astype(int)

    return df

# Cleaned frame of the fetched page; the analysis cells below all read df_test.
df_test = clean_results(test)

In [126]:
def fix_controls(df):
    """Normalise a control dataset without modifying the frame passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``title``, ``pubyear``, ``doi`` and
        ``query_appears_in_title`` (the latter is used as a row mask, so it is
        assumed to be boolean -- TODO confirm against the CSV schema).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        * the normalised frame with exact duplicate rows removed and a fresh
          0..n-1 index;
        * the subset of that frame where ``query_appears_in_title`` is true.

    Notes
    -----
    ``lcase_ttl`` and ``doi`` are cast with ``astype(str)``, so they hold the
    string form of whatever was in the source column, including for missing
    values. ``pubyear`` is cast to int with missing years set to 0.
    """
    # .assign returns a new frame, so the caller's DataFrame is left untouched
    # and re-running the calling cell always starts from the same input.
    cleaned = df.assign(
        lcase_ttl=df["title"].str.lower().astype(str),
        pubyear=df["pubyear"].fillna(0).astype(int),
        doi=df["doi"].astype(str),
    )

    cleaned = cleaned.drop_duplicates().reset_index(drop=True)

    return cleaned, cleaned[cleaned["query_appears_in_title"]]

# Load the two control CSV exports and normalise each with fix_controls, which
# yields the full frame plus the subset flagged in query_appears_in_title.
df_ctrl1, df_ctrl1_ttl = fix_controls(
    pd.read_csv("../engineEval/wos_dataset/wos_emission_query.csv")
)
df_ctrl2, df_ctrl2_ttl = fix_controls(
    pd.read_csv("../engineEval/wos_dataset/wos_ccs_query.csv")
)

In [127]:
# Twenty most frequent publishers in the cleaned results, largest first.
(
    pd.DataFrame(df_test.groupby("publisher").size(), columns=["count"])
    .sort_values(by=["count"], ascending=False)
    .head(20)
)


Unnamed: 0_level_0,count
publisher,Unnamed: 1_level_1
,137
edp sciences,51
oxford university press (oup),51
iop publishing,36
elsevier bv,24
american astronomical society,21
wiley,19
springer science and business media llc,14
aip publishing,11
aquatic mammals journal,10


In [128]:
# Records with an empty publisher, broken down by document type and by
# whether a DOI is present.
(
    df_test.loc[df_test["publisher"] == ""]
    .groupby(["documentType", df_test["doi"].notnull()])
    .size()
)

documentType  doi  
              False      1
              True       2
research      False     28
              True     106
dtype: int64

In [129]:
# Inspect the column dtypes of the cleaned results frame.
df_test.dtypes

id                 int64
magId             object
oaiIds            object
doi               object
documentType      object
title             object
abstract          object
publisher         object
yearPublished    float64
journals          object
dataProviders     object
identifiers       object
authors           object
lcase_ttl         object
pubyear            int64
dtype: object

In [135]:
def overlap_eval(df1,df2):
    """Count how many records of ``df1`` can be found in ``df2``.

    Records are matched twice, once on ``lcase_ttl`` and once on ``doi``.
    Rows whose key is missing (null, blank, or the literal ``"nan"`` that
    ``astype(str)`` produces for missing values) are excluded from the
    corresponding match, because ``pd.merge`` would otherwise pair missing
    keys with each other and report them as overlaps.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Reference set; needs the columns ``doi``, ``pubyear``, ``lcase_ttl``.
    df2 : pandas.DataFrame
        Set being evaluated; needs the columns ``doi`` and ``lcase_ttl``.

    Returns
    -------
    dict
        * ``wos_input`` -- number of rows in ``df1``
        * ``api_input`` -- number of rows in ``df2``
        * ``intersection_doi`` -- number of row pairs sharing a DOI
        * ``intersection_title`` -- number of row pairs sharing a title
        * ``intersection_doi_and_title`` -- number of distinct
          ``(doi, pubyear, lcase_ttl)`` records of ``df1`` matched by either
          key; a record matched by both keys is counted once.
    """
    record_cols = ["doi", "pubyear", "lcase_ttl"]

    def _usable(frame, key):
        # Keep only rows whose join key holds a real value.
        values = frame[key]
        placeholder = values.astype(str).str.strip().str.lower().isin(["", "nan"])
        return frame.loc[values.notna() & ~placeholder]

    # Only the join key is taken from df2, so every output column comes from
    # df1. Both merge results then describe a matched record with the same
    # three values, which lets drop_duplicates collapse records found by both
    # keys.
    left = df1.loc[:, record_cols]
    df_t = pd.merge(_usable(left, "lcase_ttl"),
                    _usable(df2.loc[:, ["lcase_ttl"]], "lcase_ttl"),
                    on="lcase_ttl")
    df_d = pd.merge(_usable(left, "doi"),
                    _usable(df2.loc[:, ["doi"]], "doi"),
                    on="doi")

    df_td = pd.concat([df_t, df_d]).loc[:, record_cols]\
                .drop_duplicates().reset_index(drop=True)

    results = {"wos_input" : len(df1.index),
               "api_input" : len(df2.index),
               "intersection_doi" : len(df_d.index),
               "intersection_title" : len(df_t.index),
               "intersection_doi_and_title" : len(df_td.index),
               #"query_criteria" : df_td["q_params"][0],
               #"query_source" : df_td["data_source"][0]
              }

    return results

# Overall overlap between the first control set and the cleaned API results.
toplineResults = [overlap_eval(df_ctrl1, df_test)]
#toplineResults.append(overlap_eval(df_ctrl2,df_q2))

# The same comparison restricted to single publication years.
yearlyResults = []
for i in (1995, 2015, 2021, 2022):
    ctrl_in_year = df_ctrl1.loc[df_ctrl1["pubyear"] == i]
    api_in_year = df_test.loc[df_test["pubyear"] == i]
    year = {**overlap_eval(ctrl_in_year, api_in_year), "year": i}
    yearlyResults.append(year)

print(toplineResults)
print(yearlyResults)

[{'wos_input': 3615, 'api_input': 550, 'intersection_doi': 3, 'intersection_title': 0, 'intersection_doi_and_title': 3}]
[{'wos_input': 48, 'api_input': 1, 'intersection_doi': 0, 'intersection_title': 0, 'intersection_doi_and_title': 0, 'year': 1995}, {'wos_input': 939, 'api_input': 52, 'intersection_doi': 3, 'intersection_title': 0, 'intersection_doi_and_title': 3, 'year': 2015}, {'wos_input': 2271, 'api_input': 0, 'intersection_doi': 0, 'intersection_title': 0, 'intersection_doi_and_title': 0, 'year': 2021}, {'wos_input': 217, 'api_input': 0, 'intersection_doi': 0, 'intersection_title': 0, 'intersection_doi_and_title': 0, 'year': 2022}]
