In [1]:
import json
import os
import time

import pandas as pd
import requests

# Data

In [2]:
def scrape_dois(doi_list, output_path="publications.json", delay=0.0, timeout=30):
    """Fetch Crossref metadata for each DOI and save the combined result as JSON.

    Args:
        doi_list: Iterable of DOI strings; each one is appended to the
            Crossref ``/works/`` endpoint URL.
        output_path: File the combined JSON is written to. Defaults to
            ``"publications.json"``.
        delay: Seconds to sleep after each request. Defaults to 0 (no pause).
        timeout: Seconds to wait for each HTTP response before the DOI is
            recorded as failed.

    Returns:
        A tuple ``(combined_data, failed_dois)``: the decoded JSON payloads of
        all lookups that returned HTTP 200, and the DOIs whose lookup failed.
    """
    base_url = 'https://api.crossref.org/works/'
    combined_data = []
    failed_dois = []

    for doi in doi_list:
        url = base_url + doi

        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # A network problem for one DOI must not discard the results
            # already collected; treat it like any other failed lookup.
            response = None

        if response is not None and response.status_code == 200:
            combined_data.append(response.json())
        else:
            failed_dois.append(doi)
            print(f"Failed to retrieve data for DOI: {doi}\n")

        # Delay between API requests
        if delay > 0:
            time.sleep(delay)

    # Save combined JSON output to a file
    with open(output_path, "w") as file:
        json.dump(combined_data, file, indent=4)
    # Report the path that was actually written.
    print(f"Combined JSON output saved to: {output_path}")

    return combined_data, failed_dois


In [3]:
# Read the DOI list; the first CSV row holds the column names.
df = pd.read_csv('data/dois.csv', header=0)
# Drop blank cells (NaN) before de-duplicating: scrape_dois concatenates each
# entry onto a URL string, which fails for a non-string value.
dois = df['DOI'].dropna().unique().tolist()

In [4]:
# Sanity check: how many distinct DOIs are in the list.
len({*dois})

402

In [8]:
# Build one flat record per paper from the Crossref responses.
# NOTE(review): `data` is not defined in any cell shown here — presumably the
# first value returned by scrape_dois(dois); confirm before a fresh run.
table = []
for paper in data:
    message = paper['message']
    paper_type = message["type"]

    # Journal articles are skipped; check this first so no row is built for them.
    if paper_type == "journal-article":
        continue

    if paper_type == "book-chapter":
        # The second container title is used as the venue label; records with
        # fewer than two container titles get no acronym instead of crashing.
        container_titles = message.get("container-title") or []
        acronym = container_titles[1] if len(container_titles) > 1 else None
    else:
        # Keep only the first word of the event acronym; records without an
        # event (or without an acronym) get None.
        event_acronym = (message.get("event") or {}).get("acronym")
        acronym = event_acronym.split(" ")[0] if event_acronym else None

    titles = message.get("title") or []
    table.append({
        "title": titles[0] if titles else None,
        "doi": message['DOI'],
        "authors": message.get("author", []),
        "acronym": acronym,
        # NOTE(review): 'created' looks like the record-creation date rather
        # than the publication date — confirm this is the intended year.
        "year": message['created']['date-parts'][0][0],
    })


In [9]:
# Turn the list of per-paper dicts into a DataFrame, one row per paper.
df = pd.DataFrame(data=table)

### Gephi Export

In [50]:
def _author_display_name(author):
    """Return "given family" for one author dict, without stray spaces."""
    parts = (author.get("given", ""), author.get("family", ""))
    full_name = " ".join(part.strip() for part in parts if part and part.strip())
    # Entries with neither name part fall back to a "name" field if present
    # (presumably organisational authors — confirm against the data).
    return full_name or author.get("name", "")


# One comma-separated string of author names per paper; authors that yield
# an empty name are left out so the list has no blank entries.
df["names"] = df["authors"].apply(
    lambda authors: ", ".join(
        name for name in map(_author_display_name, authors) if name
    )
)

In [52]:
# Write the per-paper author-name strings to CSV for Gephi.
os.makedirs("export", exist_ok=True)  # to_csv does not create missing folders
df["names"].to_csv("export/gephi.csv", index=False)

### JSON Export

In [48]:
# Dump the per-paper records collected in `table` as a JSON array.
os.makedirs("export", exist_ok=True)  # open() fails if the folder is missing
with open("export/IR_Institute_metadata.json", "w") as file:
    json.dump(table, file)