In [1]:
import requests
import pandas as pd
import numpy as np
import pandas_profiling as pp
from ratelimit import limits, RateLimitException, sleep_and_retry
from backoff import on_exception, expo
from local.config import core

In [250]:
# CORE API 3.0.0 BETA #

apikey = core.api_key

@sleep_and_retry
@limits(calls=4, period=100)
def query_api(query, scrollId=None):
    
    headers={"Authorization":"Bearer "+apikey}
    search_url = "https://api.core.ac.uk/v3/search/works"
    
    if not scrollId:
        response = requests.get(f"{search_url}?q={query}&limit=75&scroll=true",headers=headers)
    else:
        response = requests.get(f"{search_url}?q={query}&limit=75&scrollId={scrollId}",headers=headers)        
    
    headers = response.headers
    status_code = response.status_code
    print(f"Code: {status_code} | RateLimit: {headers['X-RateLimit-Remaining']} / {headers['X-RateLimit-Limit']} | Retry: {headers['X-RateLimit-Retry-After']}")
        
    hit = response.json()
    
    if "results" in hit:
        for element in hit["results"]: 
            if "fullText" in element:
                del element["fullText"]

    return hit, status_code

def scroll(query):
    allResults = []
    scrollId = None
    
    result, status_code = query_api(query)
    
    scrollId = result["scrollId"]
    totalHits = int(result["totalHits"])
    hitCount = int(result["offset"]) + int(result["limit"])
    allResults.append(result["results"])
    print(f"{hitCount} < {totalHits}")
    
    while (scrollId is not None) & (hitCount < totalHits):
        result, status_code = query_api(query, scrollId=scrollId)
        
        if "results" in result:
            scrollId = result["scrollId"]
            hitCount = hitCount + int(result["limit"])
            print(f"{hitCount} < {totalHits}")
            allResults.append(result["results"])
        elif status_code == 500:
            continue
        else:
            break
    
    return allResults

def clean_results(results):
    
    
    def parse_journal_name(row):
        for x in row:
            if len(x) > 0:
                return x["title"]
            else:
                return "N/A"

    def parse_identifiers(row):
        for x in row:
            if len(x) > 0:
                return x["identifiers"]
            else:
                return "N/A"
        
    def parse_identifiers_issn(row):
        id_array = []
        if row is not None:
            for x in row:
                if not x.startswith("issn"):
                    id_array.append(x)

            return id_array

    results = [item for sublist in results for item in sublist]

    keep_col = ["id","magId","oaiIds","doi","documentType","title","abstract","publisher","yearPublished","journals","dataProviders","identifiers","authors"]

    df = pd.DataFrame(results).loc[:,keep_col]
    df["lcase_ttl"] = df["title"].str.lower()
    df["abstract"] = df["abstract"].str.lower()
    df["publisher"] = df["publisher"].str.lower().apply(lambda x: x[1:-1] if x.startswith("'") & x.endswith("'") else x)
    df["pubyear"] = df["yearPublished"].fillna(0).astype(int)    
    df["documentType"] = df["documentType"].apply(lambda x: None if x == "" else x)
    
    df["journal_name"] = df["journals"].apply(lambda x: parse_journal_name(x))
    df["identifiers"] = df["journals"].apply(lambda x: parse_identifiers(x))
    df["issns"] = df["identifiers"].apply(lambda x: parse_identifiers_issn(x))
    
    df.drop(columns=["yearPublished","journals","identifiers","title"], inplace=True)
    
    return df

In [7]:
q = '(("c02 capture and storage")+OR+("carbon capture and storage"))+(utili?ation+OR+usage)+yearPublished:2015'
q2_2015 = scroll(q)

Code: 200 | RateLimit: 8 / 10 | Retry: 2022-05-10T10:19:56+0000
75 < 675
Code: 200 | RateLimit: 6 / 10 | Retry: 2022-05-10T10:19:59+0000
150 < 675
Code: 200 | RateLimit: 4 / 10 | Retry: 2022-05-10T10:20:03+0000
225 < 675
Code: 200 | RateLimit: 2 / 10 | Retry: 2022-05-10T10:20:06+0000
300 < 675
Code: 200 | RateLimit: 8 / 10 | Retry: 2022-05-10T10:21:37+0000
375 < 675
Code: 200 | RateLimit: 6 / 10 | Retry: 2022-05-10T10:21:44+0000
450 < 675
Code: 200 | RateLimit: 4 / 10 | Retry: 2022-05-10T10:21:53+0000
525 < 675
Code: 200 | RateLimit: 2 / 10 | Retry: 2022-05-10T10:22:01+0000
600 < 675
Code: 500 | RateLimit: 8, 8 / 10, 10 | Retry: 2022-05-10T10:23:18+0000, 2022-05-10T10:23:18+0000
Code: 200 | RateLimit: 6 / 10 | Retry: 2022-05-10T10:23:19+0000
675 < 675


In [8]:
q_2021 = '(("c02 capture and storage")+OR+("carbon capture and storage"))+(utili?ation+OR+usage)+yearPublished:2021'
q2_2021 = scroll(q_2021)

Code: 200 | RateLimit: 8 / 10 | Retry: 2022-05-10T10:25:31+0000
75 < 456
Code: 200 | RateLimit: 6 / 10 | Retry: 2022-05-10T10:25:36+0000
150 < 456
Code: 200 | RateLimit: 4 / 10 | Retry: 2022-05-10T10:25:41+0000
225 < 456
Code: 200 | RateLimit: 2 / 10 | Retry: 2022-05-10T10:25:49+0000
300 < 456
Code: 200 | RateLimit: 8 / 10 | Retry: 2022-05-10T10:27:13+0000
375 < 456
Code: 200 | RateLimit: 6 / 10 | Retry: 2022-05-10T10:27:22+0000
450 < 456
Code: 200 | RateLimit: 4 / 10 | Retry: 2022-05-10T10:27:27+0000
525 < 456


In [9]:
q_2022 = '(("c02 capture and storage")+OR+("carbon capture and storage"))+(utili?ation+OR+usage)+yearPublished:2022'
q2_2022 = scroll(q_2022)

Code: 200 | RateLimit: 8 / 10 | Retry: 2022-05-10T10:29:59+0000
75 < 9


In [251]:
df_q2 = pd.concat([clean_results(q2_2015),clean_results(q2_2021),clean_results(q2_2022)], ignore_index=True)

In [295]:
#df_q2

In [288]:
dataProviders = df_q2["dataProviders"].explode("dataProviders").drop_duplicates()