Let's get the citations and publications per year, and the coauthors, of each author. Note that the code is semi-supervised: it should be stopped by the user once all the projects have been processed.

In [None]:
!pip install scholarly

import pandas as pd
from tqdm import tqdm
from scholarly import scholarly


# Accumulators: each one collects one DataFrame per author that was found.
cites_per_year, publications_per_year, coauthors = [], [], []

# Load the ERC projects table (semicolon-separated); the PI column of this
# frame is what the scraping loop iterates over.
df = pd.read_csv(
    "https://raw.githubusercontent.com/leorrose/ERC_analysis/main/data/erc_data.csv",
    delimiter=";",
)


# For every principal investigator (PI) listed in df, take the first author
# match returned by scholarly and collect three per-author tables:
# citations per year, coauthors, and publications per year.
for _, pi_name in tqdm(df.PI.items(), total=df.PI.shape[0]):
    # Search for the PI by name
    search_query = scholarly.search_author(pi_name)
    try:
        # Get first result of search; an exhausted iterator (no match)
        # raises StopIteration, which is reported below and the PI skipped.
        author_result = next(search_query)
        # Get all information
        author_info = scholarly.fill(author_result)

        # Get cites per year: one row per year, tagged with the PI name
        pi_cites_per_year = (
            pd.Series(author_info["cites_per_year"], name="cites_per_year")
            .to_frame()
            .reset_index(names="year")
        )
        pi_cites_per_year["author"] = pi_name
        cites_per_year.append(pi_cites_per_year)

        # Get coauthors. The value order must match the column order below:
        # (author_name, author_id, coauthor_name, coauthor_id).
        pi_coauthors = [
            [
                pi_name,
                author_info["scholar_id"],
                coauthor["name"],
                coauthor["scholar_id"],
            ]
            for coauthor in author_info["coauthors"]
        ]
        coauthors.append(
            pd.DataFrame(
                pi_coauthors,
                columns=["author_name", "author_id", "coauthor_name", "coauthor_id"],
            )
        )

        # Get publications per year: count publications that carry a
        # "pub_year" in their "bib" entry; those without one are ignored.
        pi_publications_per_year = {}
        for publication in author_info["publications"]:
            year = publication.get("bib", {}).get("pub_year")
            if year is not None:
                pi_publications_per_year[year] = pi_publications_per_year.get(year, 0) + 1
        pi_publications_per_year = (
            pd.Series(pi_publications_per_year, name="publications_per_year")
            .to_frame()
            .reset_index(names="year")
        )
        pi_publications_per_year["author"] = pi_name
        publications_per_year.append(pi_publications_per_year)
    except StopIteration:
        print(f"{pi_name} - StopIteration error")

Let's save the results:

In [None]:
# Stack the per-author frames of each accumulator and write one CSV per
# table (without the index column), in the same order as collected above.
for csv_name, author_frames in (
    ("cites_per_year.csv", cites_per_year),
    ("publications_per_year.csv", publications_per_year),
    ("coauthors.csv", coauthors),
):
    pd.concat(author_frames).to_csv(csv_name, index=False)