In [1]:
import os
from time import sleep

import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry

In [2]:
# Lens API bearer token. It is read from the LENS_API_TOKEN environment
# variable so the secret never has to be typed into (or saved with) the
# notebook; when the variable is unset this falls back to an empty string.
TOKEN = os.environ.get("LENS_API_TOKEN", "")

In [3]:
def chunks(l, n=999):
    """Lazily split ``l`` into consecutive slices holding at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    offsets = range(0, len(l), n)
    yield from (l[start:start + n] for start in offsets)


@sleep_and_retry
@limits(calls=9, period=60)
def remote_call(dois: list, token: str, timeout: float = 60.0):
    """Query the Lens scholarly search API for the fields of study of a DOI batch.

    The function is decorated with ``limits(calls=9, period=60)`` and
    ``sleep_and_retry`` from the ``ratelimit`` package, so calls are throttled
    to 9 per 60 seconds.

    Parameters
    ----------
    dois : list
        DOI strings to match against the ``doi`` field. The request asks for
        at most 1000 hits (``"size"``).
    token : str
        Bearer token sent in the ``Authorization`` header.
    timeout : float, default 60.0
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot hang the notebook forever.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }
    # Build the body as a Python structure and let requests serialise it.
    # Hand-building the JSON by swapping quote characters in ``str(dois)``
    # yields invalid JSON as soon as a DOI contains a quote or a backslash.
    payload = {
        "query": {
            "terms": {
                "doi": list(dois)
            }
        },
        "size": 1000,
        "include": ["fields_of_study", "external_ids"]
    }
    response = requests.post(
        "https://api.lens.org/scholarly/search",
        json=payload,
        headers=headers,
        timeout=timeout
    )
    # Fail here, with the HTTP status in the message, instead of handing an
    # error body to the caller.
    response.raise_for_status()
    return response.json()


def process(response):
    """Yield one ``{"doi": ..., "fields": ...}`` record per item in ``response["data"]``.

    ``doi`` is the concatenation of the ``value`` of every ``external_ids``
    entry whose ``type`` is ``"doi"`` (an empty string when there is none).
    ``fields`` is the item's ``fields_of_study`` joined with ``"_"``, or the
    placeholder ``"No data."`` when the item has no fields of study.

    Raises
    ------
    KeyError
        If ``response`` has no ``"data"`` key.
    """
    for item in response["data"]:
        # ``or []`` covers both a missing key and an explicit null value, so a
        # sparse record produces placeholder output instead of raising
        # KeyError / TypeError part-way through a batch.
        external_ids = item.get("external_ids") or []
        fields_of_study = item.get("fields_of_study") or []
        yield {
            "doi": "".join(ext["value"] for ext in external_ids if ext["type"] == "doi"),
            "fields": "_".join(fields_of_study) or "No data."
        }

In [4]:
# Load the article table; its "doi" column drives the DOI filter and the
# final merge with the Lens results.
df = pd.read_csv("../data/deterrence-broad-scite.csv")

In [5]:
# Keep only syntactically valid DOIs ("10.", a 4-9 digit registrant, "/", suffix).
# The pattern is a raw string so "\." and "\d" reach the regex engine without
# Python's invalid-escape-sequence warning; na=False treats a missing DOI as
# "not valid" instead of putting NaN into the boolean mask.
DOIs = df.loc[
    df["doi"].str.contains(r"^10\.\d{4,9}/[-._;()/:a-zA-Z0-9]+$", na=False),
    "doi"
].tolist()

In [6]:
# Query Lens one batch of DOIs at a time and keep one DataFrame per batch.
container = []
for chunk in chunks(DOIs):
    response = remote_call(token=TOKEN, dois=chunk)
    data = process(response)
    container += [pd.DataFrame(data)]

In [7]:
# Stack the per-chunk frames into one table with a fresh 0..n-1 index.
result = pd.concat(container, ignore_index=True)

In [8]:
# (rows, columns) of the combined Lens results.
result.shape

(18393, 2)

In [63]:
# Shape once the rows carrying the "No data." placeholder are left out
result[result["fields"] != "No data."].shape

(15187, 5)

In [46]:
# Three alternative ways of flagging an article by its fields of study
# (case-insensitive regex match on the "_"-joined field names); each flag is
# stored as 0/1.
fields_of_study = result["fields"]
broad_match = fields_of_study.str.contains("Political science|International relations|International security", case=False)
narrow_match = fields_of_study.str.contains("International relations|International security", case=False)
criminal_match = fields_of_study.str.contains("criminal", case=False)

result["first_approach"] = broad_match.astype(int)
result["second_approach"] = narrow_match.astype(int)
# Third approach: the broad match, minus anything that also mentions "criminal".
result["third_approach"] = (broad_match & ~criminal_match).astype(int)

In [49]:
# The column holds 0/1 flags, so the sum is the number of articles matched
# by the first approach.
result["first_approach"].sum()

2860

In [51]:
# Share of flagged vs. unflagged articles among rows that have field data
result.loc[result["fields"] != "No data.", "first_approach"].value_counts(normalize=True)

0    0.811681
1    0.188319
Name: first_approach, dtype: float64

In [50]:
# The column holds 0/1 flags, so the sum is the number of articles matched
# by the second approach.
result["second_approach"].sum()

235

In [52]:
# Share of flagged vs. unflagged articles among rows that have field data
result.loc[result["fields"] != "No data.", "second_approach"].value_counts(normalize=True)

0    0.984526
1    0.015474
Name: second_approach, dtype: float64

In [61]:
# Shape once the rows carrying the "No data." placeholder are left out
result[result["fields"] != "No data."].shape

(15187, 5)

In [47]:
# The column holds 0/1 flags, so the sum is the number of articles matched
# by the third approach.
result["third_approach"].sum()

2645

In [53]:
# Share of flagged vs. unflagged articles among rows that have field data
result.loc[result["fields"] != "No data.", "third_approach"].value_counts(normalize=True)

0    0.825838
1    0.174162
Name: third_approach, dtype: float64

In [13]:
# Save the Lens results to CSV; index=False leaves the row index out of the file.
# NOTE(review): this cell's execution count (13) is lower than that of the cell
# adding the *_approach columns (46), so the file on disk may predate those
# columns - confirm by re-running top to bottom.
result.to_csv("../data/LENS-API-RESPONSE.csv", index=False)

In [55]:
# Left join: every row of the input table is kept, and the Lens columns are
# attached wherever the "doi" values match.
merged = df.merge(result, how="left", on="doi")

In [60]:
# Write the merged table to CSV; index=False leaves the row index out of the file.
merged.to_csv("../data/merged.csv", index=False)