In [1]:
import requests
import pandas as pd
import numpy as np
from ratelimit import limits, RateLimitException, sleep_and_retry
from backoff import on_exception, expo

In [82]:
def api_lookup(query, obj="works", cursor=None):
    
    if cursor is not None:
        cursor = f"cursor={cursor}"
    else:
        cursor = "cursor=*"
    
    base = "https://api.openalex.org/"
    perpage = "per-page=200"
    mailto = "mailto=erlanger@mcc-berlin.net"
    
    response = requests.get(f"{base}{obj}?{query}&{perpage}&{cursor}&{mailto}").json()

    results = response["results"]
    cursor = response["meta"]["next_cursor"]
    
    return results, cursor

def scroll(query, obj="works"):
    
    print(f"Query Submitted: {query}")
    
    allResults = []
    results, cursor = api_lookup(query, obj=obj)
    allResults.append(results)
    
    while cursor is not None:
        results, cursor = api_lookup(query, obj=obj, cursor=cursor)
        allResults.append(results)
    
    df = pd.concat([pd.DataFrame(x) for x in allResults], ignore_index=True)
    df = clean_results(df)
    df["q_params"] = query

    print(f"OpenAlex Response Size: {len(df.index)} Rows")
    
    return df

def clean_results(df):
    
    df["doi"] = df["ids"].astype('str').str.extract(r"'doi': \'https:\/\/doi.org\/([^\']+)'")
    df["mag"] = df["ids"].astype('str').str.extract(r"'mag': \'(\d+)'")
    df["lcase_ttl"] = df["title"].str.lower()
    df["compressed_index"] = df["abstract_inverted_index"].apply(lambda x: x.keys() if pd.notnull(x) else "N/A")
    df["data_source"] = "openAlex"
    
    return df

df1 = scroll(q1)

Query Submitted: search="emission reduction"
OpenAlex Response Size: 8065 Rows


In [88]:
# Run Eval Queries #

q1 = "search=\"emission reduction\""
q2 = "search=\"capture and storage\""

df_q1 = scroll(q1)

#12.04.2022: The API has a lot of difficulty with handling complex queries. Basic logic is OR only
#You can stack with filters but filters only on title for now, not possible to do abstracts, best course of aciton is through snapshot
#and client-side filtering with e.g. regex

df_q2 = scroll(q2)

Query Submitted: search="emission reduction"
OpenAlex Response Size: 8065 Rows
Query Submitted: search="capture and storage"
OpenAlex Response Size: 2841 Rows


In [89]:
# Regex Filtering for Complex Query #

# Todo: return to later and fix it so it evaluates all possible columns
# mask = df_q2[["lcase_ttl","display_name"]].apply(lambda x: x.str.match(r"[cC](02|arbon)") & x.str.match(r"[uU](tili.ation|sage)"))

df_q2_fltr = df_q2[df_q2["lcase_ttl"].str.match(r"^(?=.*[cC](02|arbon))(?=.*[uU](tili.ation|sage)).*$")]

In [90]:
# Import Control Groups #

def fix_controls(df):
    df["lcase_ttl"] = df["title"].str.lower().astype(str)
    df["pubyear"] = df["pubyear"].fillna(0).astype(int)
    df["doi"] = df["doi"].astype(str)
    
    return df, df[df["query_appears_in_title"]]

df_ctrl1 = pd.read_csv("../engineEval/wos_dataset/wos_emission_query.csv")
df_ctrl2 = pd.read_csv("../engineEval/wos_dataset/wos_ccs_query.csv")
    
df_ctrl1, df_ctrl1_fltr = fix_controls(df_ctrl1)
df_ctrl2, df_ctrl2_fltr = fix_controls(df_ctrl2)

In [91]:
# Titles Only DOI Comparison While I Work on Abstract #

def doi_eval(df1,df2):
    df_results = pd.merge(df1,df2, on=["doi","doi"])
    
    print("==== Start ====")
    print(f"Matches for {df_results['q_params'][0]} Title Only")
    print(f"WoS: {len(df1.index)}")
    print(f"OA: {len(df2.index)}")
    print(f"Simple doi match: {len(df_results.index)} matches ({len(df_results.index)/len(df1.index):.0%} coverage)")
    print("==== Finish =====")

doi_eval(df_ctrl1_fltr,df_q1)
doi_eval(df_ctrl2_fltr,df_q2_fltr)

==== Start ====
Matches for search="emission reduction" Title Only
WoS: 455
OA: 8065
Simple doi match: 365 matches (80% coverage)
==== Finish =====
==== Start ====
Matches for search="capture and storage" Title Only
WoS: 4
OA: 26
Simple doi match: 3 matches (75% coverage)
==== Finish =====
