In [1]:
import os

import numpy as np
import pandas as pd
import requests
from backoff import on_exception, expo
from ratelimit import limits, RateLimitException, sleep_and_retry

In [2]:
def searchOA(q):
    """Download every OpenAlex `works` record matching a query string.

    The request URL is built from the OpenAlex base URL, the `works`
    endpoint, the caller's query string, a page size of 200, a paging cursor
    and a `mailto` parameter. Pages are followed through
    `meta.next_cursor` until the API reports no further cursor.

    Parameters
    ----------
    q : str
        Raw query-string fragment inserted verbatim after `?`, e.g.
        ``'search="emission reduction"'``.

    Returns
    -------
    pandas.DataFrame
        One row per returned work (fresh 0..n-1 index) with the API fields as
        columns, plus:
        - ``doi`` / ``mag``: values pulled out of the ``ids`` column
          (missing where not present),
        - ``data_source``: the constant ``"openAlex"``,
        - ``q_params``: the query string `q`.
        If the query has no hits, an empty frame containing only these four
        extra columns is returned.

    Raises
    ------
    requests.HTTPError
        If any page request returns an HTTP error status.
    """
    base = "https://api.openalex.org/"
    obj = "works"
    perpage = "per-page=200"
    mailto = "mailto=erlanger@mcc-berlin.net"
    timeout = 60  # seconds; without it a stalled connection blocks forever

    # Collect all pages in a plain list and build the frame once:
    # DataFrame.append was removed in pandas 2.0 and re-copying the frame for
    # every page is quadratic in the number of pages.
    records = []
    cursor = "*"  # "*" asks the API to start a new cursor-paged result set
    while cursor is not None:
        response = requests.get(
            f"{base}{obj}?{q}&{perpage}&cursor={cursor}&{mailto}",
            timeout=timeout,
        )
        # Fail loudly on 4xx/5xx instead of hitting a KeyError on the payload.
        response.raise_for_status()
        payload = response.json()
        records.extend(payload["results"])
        cursor = payload["meta"].get("next_cursor")

    df_oa = pd.DataFrame(records)

    if "ids" in df_oa.columns:
        # The ids column holds dicts; the patterns run on their repr string.
        ids_text = df_oa["ids"].astype('str')
        df_oa["doi"] = ids_text.str.extract(r"'doi': \'([^\']+)'", expand=False)
        df_oa["mag"] = ids_text.str.extract(r"'mag': \'(\d+)'", expand=False)
    else:
        # No results at all: still expose the derived columns.
        df_oa["doi"] = pd.Series(dtype="object")
        df_oa["mag"] = pd.Series(dtype="object")

    df_oa["data_source"] = "openAlex"
    df_oa["q_params"] = q

    return df_oa

In [3]:
# Query-string fragment handed to searchOA: a quoted search term.
q1 = "search=\"emission reduction\""
df_q1 = searchOA(q1)

# Number of rows harvested for q1.
len(df_q1)

8065

In [4]:
# q2 #"((\"Carbon capture and Storage\" OR \"CO2 capture and storage\") AND (\"Utili*ation\" OR \"usage\"))"

# 12.04.2022: The API has a lot of difficulty handling complex queries; the basic search logic is OR only.
# Filters can be stacked, but for now they apply to titles only, not to abstracts. The best course of action
# is to work from the snapshot and filter on the MCC side, e.g. with regex.

# Query-string fragment handed to searchOA: a quoted search term.
q2 = "search=\"capture and storage\""
df_q2 = searchOA(q2)

# Number of rows harvested for q2.
len(df_q2)

2841

In [None]:
# Keep only the rows whose abstract_inverted_index is non-empty.
# The previous lambda returned len() of the whole column (a single integer),
# which .loc treated as a row label instead of a row filter. The mask below
# is evaluated per row; rows with a missing value compare as False.
df_q2.loc[lambda x: x["abstract_inverted_index"].str.len().gt(0)]

In [None]:
# CORE API 3.0.0 BETA #

# CORE API key. Read from the CORE_API_KEY environment variable so the real
# key never has to be saved inside the notebook; falls back to the original
# placeholder string when the variable is not set.
apikey = os.environ.get("CORE_API_KEY", "api_key")

#@on_exception(expo, RateLimitException, max_tries=5)
@sleep_and_retry
@limits(calls=5, period=61)
def query_api(query, scrollId=None):
    """Fetch one page of CORE v3 work search results.

    The function is wrapped with ratelimit's `limits(calls=5, period=61)`
    and `sleep_and_retry` decorators.

    Parameters
    ----------
    query : str
        Search expression, placed verbatim in the `q=` URL parameter.
    scrollId : str, optional
        Scroll identifier from a previous page. When falsy, a new scroll is
        requested with `scroll=true`.

    Returns
    -------
    dict
        The decoded JSON response, with the "fullText" key removed from
        every entry of its "results" list.

    Raises
    ------
    Exception
        If the HTTP status code is anything other than 200.
    """
    search_url = "https://api.core.ac.uk/v3/search/works"
    auth_headers = {"Authorization": "Bearer " + apikey}

    # Either continue an existing scroll or open a new one.
    paging = f"scrollId={scrollId}" if scrollId else "scroll=true"
    response = requests.get(f"{search_url}?q={query}&limit=50&{paging}", headers=auth_headers)

    # Echo status and headers for every call.
    print(response.status_code)
    print(response.headers)
    if response.status_code != 200:
        raise Exception(f"API response: {response.status_code}")

    hit = response.json()

    # Drop the full-text payload from each record, keeping only the metadata.
    for record in hit["results"]:
        record.pop("fullText", None)

    return hit

def iter_query(query):
    """Collect all results of a CORE search by following its scroll id.

    Calls `query_api` once to open the scroll, then keeps requesting pages
    with the returned scroll id.

    Parameters
    ----------
    query : str
        Search expression forwarded unchanged to `query_api`.

    Returns
    -------
    list
        The entries of every page's "results" list, concatenated into one
        flat list in the order the pages were received. Empty if the first
        page has no results.
    """
    allResults = []

    result = query_api(query)
    while True:
        page = result.get("results") or []
        allResults.extend(page)
        scrollId = result.get("scrollId")
        # Stop when the scroll is exhausted: either no id to continue with,
        # or an empty page (guards against looping forever if the API keeps
        # handing back a scroll id after the last record).
        if not page or not scrollId:
            break
        result = query_api(query, scrollId=scrollId)

    # The original built this list but never returned it.
    return allResults

In [None]:
# Run the CORE scroll helper on a sample query and keep its return value.
test = iter_query("katonah+baseball")

In [None]:
# Show the value bound to `test` as the cell output.
test