<a href="https://colab.research.google.com/github/ipeirotis/scholar_v2/blob/main/Calculate_Percentile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q scholarly


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Download the percentile table and key its rows by the 'age' column.
url = 'https://raw.githubusercontent.com/ipeirotis/scholar_v2/main/percentiles.csv'
percentile_df = pd.read_csv(url)
percentile_df = percentile_df.set_index('age')
percentile_df.head()


In [None]:
# Convert the column labels to floats so a numeric percentile (e.g. 99.1) can be used as a label.
percentile_df.columns = list(map(float, percentile_df.columns))

In [None]:
# Worked example: read the citation threshold stored for one (age, percentile) cell.
# The notebook author expects 364 citations for the 99.1 percentile at year 5;
# the printed value is what the table actually contains.

year, percentile = 5, 99.1
citations = percentile_df.loc[year][percentile]
print(f"The number of citations necessary to score at the {percentile} percentile at year {year} is {citations}")


In [None]:
from datetime import datetime
from scholarly import scholarly
import json
import logging

def get_scholar_data(author_name):
    """Fetch the first Google Scholar match for ``author_name`` and its publications.

    Returns a ``(author, publications)`` pair. Each publication has its
    ``num_citations`` entry renamed to ``citedby``; the author and every
    publication are stamped with ``last_updated_ts`` (integer timestamp) and
    ``last_updated`` (formatted date string). The ``publications`` entry is
    removed from the returned author dictionary.

    If the search or the fill step raises, the error is logged and
    ``(None, None)`` is returned.
    """
    try:
        # Take the first search hit and ask scholarly to fill in its details
        candidates = scholarly.search_author(author_name)
        first_match = next(candidates)
        author = scholarly.fill(first_match)
    except Exception:
        logging.exception("Error getting data from Google Scholar")
        return None, None

    # One shared stamp so the author and all publications carry the same update time
    fetched_at = datetime.now()
    stamp = {
        "last_updated_ts": int(fetched_at.timestamp()),
        "last_updated": fetched_at.strftime("%Y-%m-%d %H:%M:%S"),
    }

    # Rename the citation counter and stamp every publication
    publications = list(author["publications"])
    for entry in publications:
        entry["citedby"] = entry.pop("num_citations")
        entry.update(stamp)

    # Stamp the author record and drop the publications entry,
    # which is returned separately
    author.update(stamp)
    author.pop("publications")

    return author, publications

In [None]:
def score_papers(row):
    """Return the percentile score of one paper given its age and citation count.

    ``row`` must provide ``row['age']`` and ``row['citations']``. The thresholds
    are read from the module-level ``percentile_df`` (rows keyed by age, columns
    keyed by percentile). When the age has no row of its own, the row with the
    numerically closest age is used.

    Returns 0.0 when the citations do not exceed the smallest threshold, 100.0
    when they reach the largest one, the matching percentile label on an exact
    threshold hit, and otherwise a linear interpolation between the two
    surrounding percentiles.
    """
    paper_age = row['age']
    n_cites = row['citations']

    # Use the paper's own age when the table has it, otherwise snap to the closest one
    known_ages = percentile_df.index
    if paper_age in known_ages:
        lookup_age = paper_age
    else:
        lookup_age = known_ages[np.abs(known_ages - paper_age).argmin()]

    # Citation thresholds per percentile for that age
    thresholds = percentile_df.loc[lookup_age]

    # Clamp anything at or outside the range covered by the table
    if n_cites <= thresholds.min():
        return 0.0
    if n_cites >= thresholds.max():
        return 100.0

    # Percentile labels of the closest thresholds on either side of the citation count
    lo_pct = thresholds[thresholds <= n_cites].idxmax()
    hi_pct = thresholds[thresholds >= n_cites].idxmin()

    # Exact hit on a threshold: no interpolation needed
    if lo_pct == hi_pct:
        return hi_pct

    # Position of the citation count between the two thresholds, mapped onto
    # the gap between their percentile labels
    lo_val = thresholds[lo_pct]
    hi_val = thresholds[hi_pct]
    fraction = (n_cites - lo_val) / (hi_val - lo_val)
    return lo_pct + fraction * (hi_pct - lo_pct)

In [None]:
def get_author_statistics(author_name, current_year=None):
    """Score every cited, dated publication of an author against the percentile table.

    Parameters
    ----------
    author_name : str
        Name passed to ``get_scholar_data``.
    current_year : int, optional
        Year used as "now" when computing paper ages. Defaults to the current
        calendar year, so ages stay correct whenever the notebook is re-run.
        Pass a fixed year to reproduce an earlier analysis.

    Returns
    -------
    pandas.DataFrame
        One row per publication that has a ``pub_year`` and more than zero
        citations, with columns ``citations``, ``age``, ``title``,
        ``percentile_score`` (rounded to 2 decimals) and ``paper_rank``
        (1 = highest score, ties broken by order of appearance). Rows are
        ordered by ``paper_rank``. The frame is empty, with the same columns,
        when the author lookup fails or no publication qualifies.
    """
    if current_year is None:
        current_year = datetime.now().year

    author, publications = get_scholar_data(author_name)

    # get_scholar_data logs the error and returns (None, None) on failure;
    # treat that as "no publications" instead of crashing on iteration.
    if publications is None:
        publications = []

    pubs = [
        {
            "citations": p['citedby'],
            # A paper published in current_year has age 1
            "age": current_year - int(p['bib'].get('pub_year')) + 1,
            "title": p['bib'].get('title'),
        }
        for p in publications
        if p['bib'].get('pub_year') is not None and p['citedby'] > 0
    ]

    # Explicit columns keep the schema stable even when there are no rows
    query = pd.DataFrame(pubs, columns=["citations", "age", "title"])

    # Row-wise apply on an empty frame does not yield a score column,
    # so build the empty result directly.
    if query.empty:
        query['percentile_score'] = pd.Series(dtype=float)
        query['paper_rank'] = pd.Series(dtype=int)
        return query

    query['percentile_score'] = query.apply(score_papers, axis=1).round(2)
    query['paper_rank'] = (
        query['percentile_score']
        .rank(ascending=False, method='first')
        .astype(int)
    )

    # Sorting by rank (not by score) keeps tied scores in the same order
    # that paper_rank assigned them.
    return query.sort_values('paper_rank')

# Individual author analysis

In [None]:
# Score the publications of a single author and display the resulting table.
author_name = "Panos Ipeirotis"
result = get_author_statistics(author_name)
result

In [None]:
# Histograms (20 bins each) of the result table's columns.
result.hist(bins=20)

In [None]:
# Percentile score against paper rank; each point is colored by the paper's age.
result.plot.scatter(x='paper_rank', y='percentile_score', c='age', cmap='Blues_r', s=2, figsize=(10, 6))

# Comparative author analysis

In [None]:
# Authors to compare; each name is looked up separately.
authors = [
    "Andrew Ng",
    "Jon Kleinberg",
    "Jure Leskovec",
    "Eric Horvitz",
    "Yann Lecun",
]

# Map each author name to that author's scored-publications table.
results = {}
for author in authors:
    results[author] = get_author_statistics(author)

In [None]:
# One single-column frame per author: indexed by paper_rank, with the
# score column renamed to the author's name.
to_join = [
    results[author]
    .filter(['paper_rank', 'percentile_score'])
    .rename(columns={'percentile_score': author})
    .set_index('paper_rank')
    for author in authors
]

# Align the frames on paper_rank so each row holds every author's paper of that rank.
matched = pd.concat(to_join, axis=1).sort_index()
matched

In [None]:
# One line per author: percentile score as a function of paper rank.
matched.plot.line(figsize=(10, 6), grid=True)

In [None]:
# Mean percentile score per author, rounded to 2 decimals.
matched.mean().round(2)