In [10]:
import requests
import pandas as pd
import numpy as np
from ratelimit import limits, RateLimitException, sleep_and_retry
from backoff import on_exception, expo
from local.config import core

In [135]:
# CORE API 3.0.0 BETA #

apikey = core.api_key

@sleep_and_retry
@limits(calls=4, period=100)
def query_api(query, scrollId=None):
    
    headers={"Authorization":"Bearer "+apikey}
    search_url = "https://api.core.ac.uk/v3/search/works"
    
    if not scrollId:
        response = requests.get(f"{search_url}?q={query}&limit=75&scroll=true",headers=headers)
    else:
        response = requests.get(f"{search_url}?q={query}&limit=75&scrollId={scrollId}",headers=headers)        
    
    headers = response.headers
    status_code = response.status_code
    print(f"Code: {status_code} | RateLimit: {headers['X-RateLimit-Remaining']} / {headers['X-RateLimit-Limit']} | Retry: {headers['X-RateLimit-Retry-After']}")
    
    #if response.status_code != 200:
    #    raise Exception(f"API response: {response.status_code}")
        
    hit = response.json()
    
    if "results" in hit:
        for element in hit["results"]: 
            if "fullText" in element:
                del element["fullText"]

    return hit, status_code

def scroll(query):
    allResults = []
    scrollId = None
    
    result, status_code = query_api(query)
    
    scrollId = result["scrollId"]
    totalHits = int(result["totalHits"])
    hitCount = int(result["offset"]) + int(result["limit"])
    allResults.append(result["results"])
    print(f"{len(scrollId)}: {hitCount} < {totalHits}")
    
    while (scrollId is not None) & (hitCount < totalHits):
        result, status_code = query_api(query, scrollId=scrollId)
        
        if "results" in result:
            scrollId = result["scrollId"]
            hitCount = hitCount + int(result["limit"])
            print(f"{len(scrollId)}: {hitCount} < {totalHits}")
            allResults.append(result["results"])
        elif status_code == 500:
            continue
        else:
            break
    
    return allResults

In [136]:
q = '(("c02 capture and storage")+OR+("carbon capture and storage"))+(utili?ation+OR+usage)+yearPublished:2015'
q2_2015 = scroll(q)

Code: 200 | RateLimit: 8 / 10 | Retry: 2022-05-04T13:10:17+0000
436: 75 < 675
Code: 200 | RateLimit: 6 / 10 | Retry: 2022-05-04T13:10:20+0000
436: 150 < 675
Code: 200 | RateLimit: 4 / 10 | Retry: 2022-05-04T13:10:24+0000
436: 225 < 675
Code: 200 | RateLimit: 2 / 10 | Retry: 2022-05-04T13:10:27+0000
436: 300 < 675
Code: 200 | RateLimit: 8 / 10 | Retry: 2022-05-04T13:11:31+0000
436: 375 < 675
Code: 200 | RateLimit: 6 / 10 | Retry: 2022-05-04T13:11:41+0000
436: 450 < 675
Code: 200 | RateLimit: 4 / 10 | Retry: 2022-05-04T13:11:52+0000
436: 525 < 675
Code: 200 | RateLimit: 2 / 10 | Retry: 2022-05-04T13:12:05+0000
436: 600 < 675
Code: 500 | RateLimit: 8, 8 / 10, 10 | Retry: 2022-05-04T13:13:13+0000, 2022-05-04T13:13:13+0000
Code: 200 | RateLimit: 6 / 10 | Retry: 2022-05-04T13:13:14+0000
436: 675 < 675


In [146]:
with open("q2_2015_results.txt", "a") as o:
    for x in q2_2015:
        o.write(f"{x}\n")

In [141]:
def clean_results(results):

    keep_col = ["id","magId","oaiIds","doi","documentType","title","abstract","publisher","yearPublished","journals","dataProviders","identifiers","authors"]

    df = pd.DataFrame(results["results"], index=[0]).loc[:,keep_col]
    df["lcase_ttl"] = df["title"].str.lower()
    df["abstract"] = df["abstract"].str.lower()
    df["publisher"] = df["publisher"].str.lower().apply(lambda x: x[1:-1] if x.startswith("'") & x.endswith("'") else x)
    df["pubyear"] = df["yearPublished"].fillna(0).astype(int)
    
    return df

df_test = pd.concat([clean_results(x) for x in q2_2015], ignore_index=True)

TypeError: list indices must be integers or slices, not str

In [142]:
q2_2015[0]

[{'acceptedDate': None,
  'arxivId': None,
  'authors': [{'name': 'Boait, Fran'},
   {'name': 'JafarGandomi, Arash'},
   {'name': 'Johnson, Gareth'}],
  'citationCount': None,
  'contributors': ['Nexen',
   'Scottish Enterprise',
   '2CoEnergy',
   'Scottish Government'],
  'outputs': ['https://api.core.ac.uk/v3/outputs/429705987'],
  'createdDate': '2021-06-07T15:41:38',
  'dataProviders': ['https://api.core.ac.uk/v3/data-providers/39'],
  'depositedDate': None,
  'abstract': 'This report assesses the differences between monitoring technology requirements for CO2 storage in a saline or depleted hydrocarbon reservoir and in a hydrocarbon reservoir, when CO2 injection is used for enhanced oil recovery (EOR).\nFirst order factors dictating technology choice including geological and geographic parameters are assessed before addressing differences introduced by the choice of process (EOR or storage). A brief review of the most common monitoring technologies suitable for use in either CO2 s

In [126]:
def fix_controls(df):
    df["lcase_ttl"] = df["title"].str.lower().astype(str)
    df["pubyear"] = df["pubyear"].fillna(0).astype(int)
    df["doi"] = df["doi"].astype(str)
    
    df = df.drop_duplicates().reset_index(drop=True)
    
    return df, df[df["query_appears_in_title"]]

df_ctrl1 = pd.read_csv("../engineEval/wos_dataset/wos_emission_query.csv")
df_ctrl2 = pd.read_csv("../engineEval/wos_dataset/wos_ccs_query.csv")
    
df_ctrl1, df_ctrl1_ttl = fix_controls(df_ctrl1)
df_ctrl2, df_ctrl2_ttl = fix_controls(df_ctrl2)

In [24]:
#pd.DataFrame(df_test.groupby("publisher").size(), columns=["count"]).sort_values(by=["count"], ascending=False).head(20)
#df_test.loc[df_test["publisher"] == ""].groupby(["documentType",~df_test["doi"].isnull()]).size()

In [25]:
df_test

Unnamed: 0,id,magId,oaiIds,doi,documentType,title,abstract,publisher,yearPublished,journals,dataProviders,identifiers,authors,lcase_ttl,pubyear
0,196515033,,[oai:era.ed.ac.uk:1842/15705],,,Central North Sea - CO2 Storage Hub Enabling C...,carbon capture & storage is widely recognised ...,scottish enterprise,2012.0,[],[https://api.core.ac.uk/v3/data-providers/39],"[{'identifier': 'oai:era.ed.ac.uk:1842/15705',...","[{'name': 'Scottish Enterprise'}, {'name': 'SC...",central north sea - co2 storage hub enabling c...,2012
1,419799,,[oai:nora.nerc.ac.uk:17308],10.1787/5km4q8rj3hxs-en,research,Carbon capture and storage,to stabilise atmospheric concentrations of car...,british geological survey,2010.0,"[{'title': None, 'identifiers': ['2079-2581']}]",[https://api.core.ac.uk/v3/data-providers/4786...,"[{'identifier': '10.1787/5km4q8rj3hxs-en', 'ty...","[{'name': 'Chadwick, Andy'}]",carbon capture and storage,2010
2,20881654,150716990,[oai:epub.wupperinst.org:3176],10.1007/978-3-540-88546-7_78,research,Carbon capture and storage,,"wuppertal : wuppertal institut für klima, umwe...",2009.0,[],[https://api.core.ac.uk/v3/data-providers/4786...,[{'identifier': '10.1007/978-3-540-88546-7_78'...,"[{'name': 'Viebahn, Peter'}, {'name': 'Fisched...",carbon capture and storage,2009
3,18941837,2022197271,[oai:apo.org.au:3738],10.1016/j.enpol.2008.09.058,research,Carbon capture and storage,carbon (dioxide) capture and storage (ccs) has...,nsw parliamentary research service,2008.0,"[{'title': None, 'identifiers': ['0301-4215']}]",[https://api.core.ac.uk/v3/data-providers/4786...,"[{'identifier': '205536280', 'type': 'CORE_ID'...","[{'name': 'Jon Gibbins'}, {'name': 'Hannah Cha...",carbon capture and storage,2008
4,23803942,,[oai:dspace.library.uu.nl:1874/272947],,research,Carbon Capture and Storage,"emissions of carbon dioxide, the most importan...",,2012.0,[],[https://api.core.ac.uk/v3/data-providers/988],[{'identifier': 'oai:dspace.library.uu.nl:1874...,"[{'name': 'Benson, S.M.'}, {'name': 'Bennaceur...",carbon capture and storage,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,42978512,2143912906,"[oai:spiral.imperial.ac.uk:10044/1/27504, oai:...",10.1029/2011gl049680,research,Residual CO2 imaged with x-ray micro-tomography,"carbon capture and storage (ccs), where co2 is...",american geophysical union (agu),2011.0,"[{'title': None, 'identifiers': ['1944-8007', ...",[https://api.core.ac.uk/v3/data-providers/4786...,"[{'identifier': '10.1029/2011gl049680', 'type'...","[{'name': 'Iglauer, S'}, {'name': 'Paluszny, A...",residual co2 imaged with x-ray micro-tomography,2011
546,46381724,,[],10.1016/j.egypro.2011.02.437,research,Victorian carbon dioxide geological storage op...,"abstractin a carbon constrained environment, t...",published by elsevier ltd.,2011.0,"[{'title': None, 'identifiers': ['1876-6102']}]",[https://api.core.ac.uk/v3/data-providers/2610...,"[{'identifier': '2014205596', 'type': 'MAG_ID'...","[{'name': 'O’Brien, Geoffrey'}, {'name': 'Gunn...",victorian carbon dioxide geological storage op...,2011
547,37471889,,[oai:scholarship.law.duke.edu:faculty_scholars...,,research,Pursuing Geoengineering for Atmospheric Restor...,"geoengineering is fraught with problems, but r...",duke university school of law,2010.0,[],[https://api.core.ac.uk/v3/data-providers/1382],[{'identifier': 'oai:scholarship.law.duke.edu:...,"[{'name': 'Salzman, James'}, {'name': 'Jackson...",pursuing geoengineering for atmospheric restor...,2010
548,59312331,,"[oai:centaur.reading.ac.uk:77368, oai:nora.ner...",10.1038/s41467-018-05340-z,research,Land-use emissions play a critical role in lan...,scenarios that limit global warming to below 2...,nature publishing group,2018.0,"[{'title': None, 'identifiers': ['2041-1723', ...","[https://api.core.ac.uk/v3/data-providers/17, ...","[{'identifier': '263631303', 'type': 'CORE_ID'...","[{'name': 'Harper, Anna B.'}, {'name': 'Powell...",land-use emissions play a critical role in lan...,2018


id                 int64
magId             object
oaiIds            object
doi               object
documentType      object
title             object
abstract          object
publisher         object
yearPublished    float64
journals          object
dataProviders     object
identifiers       object
authors           object
lcase_ttl         object
pubyear            int64
dtype: object

[{'wos_input': 3615, 'api_input': 550, 'intersection_doi': 3, 'intersection_title': 0, 'intersection_doi_and_title': 3}]
[{'wos_input': 48, 'api_input': 1, 'intersection_doi': 0, 'intersection_title': 0, 'intersection_doi_and_title': 0, 'year': 1995}, {'wos_input': 939, 'api_input': 52, 'intersection_doi': 3, 'intersection_title': 0, 'intersection_doi_and_title': 3, 'year': 2015}, {'wos_input': 2271, 'api_input': 0, 'intersection_doi': 0, 'intersection_title': 0, 'intersection_doi_and_title': 0, 'year': 2021}, {'wos_input': 217, 'api_input': 0, 'intersection_doi': 0, 'intersection_title': 0, 'intersection_doi_and_title': 0, 'year': 2022}]
