In [90]:
import requests
import pandas as pd
import numpy as np
from ratelimit import limits, RateLimitException, sleep_and_retry
from backoff import on_exception, expo

In [91]:
def searchOA(q):
    
    base = "https://api.openalex.org/"
    obj = "works"
    perpage = "per-page=200"
    cursor = "cursor=*"
    mailto = "mailto=erlanger@mcc-berlin.net"
    
    response = requests.get(f"{base}{obj}?{q}&{perpage}&{cursor}&{mailto}").json()

    df_oa = pd.DataFrame(response["results"])
    #df_meta = pd.DataFrame(response["meta"], index=[0])

    pag = response["meta"]["next_cursor"]
    while pag is not None:
        response = requests.get(f"{base}{obj}?{q}&{perpage}&cursor={pag}&{mailto}").json()
        pag = response["meta"]["next_cursor"]
        df_oa = pd.concat([df_oa,pd.DataFrame(response["results"])], ignore_index=True)
        
    df_oa["doi"] = df_oa["ids"].astype('str').str.extract(r"'doi': \'https:\/\/doi.org\/([^\']+)'")
    df_oa["mag"] = df_oa["ids"].astype('str').str.extract(r"'mag': \'(\d+)'")
    df_oa["lcase_ttl"] = df_oa["title"].str.lower()
    df_oa["data_source"] = "openAlex"
    df_oa["q_params"] = q
    
    print(f"Query Submitted: {q}")
    print(f"OpenAlex Response Size: {len(df_oa.index)} Rows")
    
    return df_oa

In [92]:
# Run Eval Queries #

q1 = "search=\"emission reduction\""
q2 = "search=\"capture and storage\""

df_q1 = searchOA(q1)

#12.04.2022: The API has a lot of difficulty with handling complex queries. Basic logic is OR only
#You can stack with filters but filters only on title for now, not possible to do abstracts, best course of aciton is through snapshot
#and client-side filtering with e.g. regex

df_q2 = searchOA(q2)

Query Submitted: search="emission reduction"
OpenAlex Response Size: 8065 Rows
Query Submitted: search="capture and storage"
OpenAlex Response Size: 2841 Rows


In [147]:
# Regex Filtering for Complex Query #

# Todo: return to later and fix it so it evaluates all possible columns
# mask = df_q2[["lcase_ttl","display_name"]].apply(lambda x: x.str.match(r"[cC](02|arbon)") & x.str.match(r"[uU](tili.ation|sage)"))

df_q2_fltr = df_q2[df_q2["lcase_ttl"].str.match(r"^(?=.*[cC](02|arbon))(?=.*[uU](tili.ation|sage)).*$")]

In [151]:
# Import Control Groups #

def fix_controls(df):
    df["lcase_ttl"] = df["title"].str.lower().astype(str)
    df["pubyear"] = df["pubyear"].fillna(0).astype(int)
    df["doi"] = df["doi"].astype(str)
    
    return df, df[df["query_appears_in_title"]]

df_ctrl1 = pd.read_csv("../engineEval/wos_dataset/wos_emission_query.csv")
df_ctrl2 = pd.read_csv("../engineEval/wos_dataset/wos_ccs_query.csv")
    
df_ctrl1, df_ctrl1_fltr = fix_controls(df_eval1)
df_ctrl2, df_ctrl2_fltr = fix_controls(df_eval2)

In [152]:
# Titles Only DOI Comparison While I Work on Abstract #

def doi_eval(df1,df2):
    df_results = pd.merge(df1,df2, on=["doi","doi"])
    
    print("==== Start ====")
    print(f"Matches for {df_results['q_params'][0]} Title Only")
    print(f"WoS: {len(df1.index)}")
    print(f"OA: {len(df2.index)}")
    print(f"Simple doi match: {len(df_results.index)} matches")
    print("==== Finish =====")

doi_eval(df_ctrl1_fltr,df_q1)
doi_eval(df_ctrl2_fltr,df_q2_fltr)

==== Start ====
Matches for search="emission reduction" Title Only
WoS: 455
OA: 8065
Simple doi match: 365 matches
==== Finish =====
==== Start ====
Matches for search="capture and storage" Title Only
WoS: 4
OA: 26
Simple doi match: 3 matches
==== Finish =====
