<a href="https://colab.research.google.com/github/ipeirotis-org/scholar_v2/blob/main/Calculate_Percentile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q scholarly


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Lookup table used to turn a paper's citation count into a percentile,
# given the paper's age (rows are indexed by 'age').
# NOTE(review): this URL uses the `ipeirotis` account, while the other data file
# in this notebook is fetched from `ipeirotis-org` -- confirm both resolve.
url = 'https://raw.githubusercontent.com/ipeirotis/scholar_v2/main/percentiles.csv'
percentiles_raw = pd.read_csv(url)
percentile_df = percentiles_raw.set_index('age')
percentile_df.head()

In [None]:
# Lookup table of paper-count thresholds, indexed by the number of years
# since an author's first publication.
url = 'https://raw.githubusercontent.com/ipeirotis-org/scholar_v2/main/author_numpapers_percentiles.csv'
papers_raw = pd.read_csv(url)
papers_df = papers_raw.set_index('years_since_first_pub')
papers_df.head()

In [None]:
# Convert the column labels of both tables to floats so that percentiles can be
# looked up numerically (e.g. percentile_df.loc[5, 99.1]).
percentile_df.columns = percentile_df.columns.map(float)
papers_df.columns = papers_df.columns.map(float)

In [None]:
# Example lookup: how many citations a paper of a given age needs in order to
# reach a given percentile. For the 99.1 percentile at year 5, the answer is
# at least 364 citations.
year, percentile = 5, 99.1
citations = percentile_df.loc[year, percentile]
print(f"The number of citations necessary to score at the {percentile} percentile at year {year} is {citations}")


In [None]:
from datetime import datetime
from scholarly import scholarly
import json
import logging

def get_scholar_data(author_name):
    """Look up an author on Google Scholar and split the result into profile and papers.

    The first search hit for ``author_name`` is filled in via ``scholarly``.
    Every publication has its ``num_citations`` entry renamed to ``citedby``
    and, like the author record itself, is stamped with the retrieval time
    (``last_updated_ts`` as a Unix timestamp, ``last_updated`` as a string).

    Returns:
        ``(author, publications)``: the filled author record without its
        ``publications`` entry, and the list of stamped publication records.
        ``(None, None)`` if the lookup raised; the error is logged.
    """
    try:
        # Take the first search hit and ask scholarly for the full record
        first_hit = next(scholarly.search_author(author_name))
        author = scholarly.fill(first_hit)
    except Exception:
        logging.exception("Error getting data from Google Scholar")
        return None, None

    # One retrieval time shared by the author record and all of its publications
    retrieved_at = datetime.now()
    timestamp = int(retrieved_at.timestamp())
    date_str = retrieved_at.strftime("%Y-%m-%d %H:%M:%S")

    # Rename the citation counter and stamp every publication record
    for record in author["publications"]:
        record["citedby"] = record.pop("num_citations")
        record.update(last_updated_ts=timestamp, last_updated=date_str)
    publications = list(author["publications"])

    # Stamp the author record, then drop the publications it no longer needs to carry
    author.update(last_updated_ts=timestamp, last_updated=date_str)
    author.pop("publications")

    return author, publications

In [None]:
def score_papers(row):
    """Convert one paper's citation count into a percentile score.

    ``row`` must provide ``'age'`` and ``'citations'``. The thresholds are read
    from the module-level ``percentile_df`` (rows indexed by age, one column
    per percentile). When the exact age is not tabulated, the closest
    tabulated age is used instead.

    Returns 0.0 when the citations do not exceed the smallest threshold, 100.0
    when they reach the largest one, and otherwise a percentile obtained by
    linear interpolation between the two surrounding thresholds.
    """
    paper_age = row['age']
    n_cites = row['citations']

    # Pick the row for this age, or for the nearest tabulated age
    known_ages = percentile_df.index
    if paper_age in known_ages:
        lookup_age = paper_age
    else:
        lookup_age = known_ages[np.abs(known_ages - paper_age).argmin()]

    thresholds = percentile_df.loc[lookup_age]

    # Out-of-range citation counts are clamped to the ends of the scale
    if n_cites <= thresholds.min():
        return 0.0
    if n_cites >= thresholds.max():
        return 100.0

    # Percentile labels of the thresholds just below and just above the count
    lo_pct = thresholds[thresholds <= n_cites].idxmax()
    hi_pct = thresholds[thresholds >= n_cites].idxmin()

    # The count sits exactly on a tabulated threshold
    if lo_pct == hi_pct:
        return hi_pct

    # Interpolate linearly between the two surrounding thresholds
    lo_cites = thresholds[lo_pct]
    hi_cites = thresholds[hi_pct]
    fraction = (n_cites - lo_cites) / (hi_cites - lo_cites)
    return lo_pct + fraction * (hi_pct - lo_pct)

In [None]:
def get_numpaper_percentiles(year):
    """Map raw paper counts to percentiles for a given career length.

    Reads the row of the module-level ``papers_df`` for ``year`` (years since
    first publication). That row maps each percentile to a number of papers;
    this function inverts it.

    Args:
        year: Years since the author's first publication. If this value is not
            tabulated in ``papers_df``, the closest tabulated year is used
            (the same fallback ``score_papers`` applies to paper ages) instead
            of raising ``KeyError``.

    Returns:
        A Series indexed by number of papers (ascending, unique) whose values
        are the highest percentile that shares that number of papers.
    """
    # Fall back to the nearest tabulated year for careers outside the table
    if year not in papers_df.index:
        year = papers_df.index[np.abs(papers_df.index - year).argmin()]

    # Percentile -> number of papers, for this career length
    counts_by_percentile = papers_df.loc[year, :]

    # Invert to number of papers -> percentile
    percentile_by_count = pd.Series(
        data=counts_by_percentile.index.to_numpy(),
        index=counts_by_percentile.to_numpy(),
    )

    # Several percentiles can share one paper count; keep the highest of them.
    # Using max (rather than "last label") does not rely on the column order.
    return percentile_by_count.groupby(level=0).max()

def find_closest(series, number):
    """Return the value of ``series`` whose index label is nearest to ``number``.

    Distance is the absolute difference between each index label and
    ``number``; on a tie, the entry that appears first in the series wins.
    """
    nearest_position = np.abs(series.index - number).argmin()
    return series.iloc[nearest_position]


In [None]:

def get_author_statistics(author_name, current_year=None):
    """Build a per-paper percentile table for a Google Scholar author.

    Args:
        author_name: Name passed to ``get_scholar_data``.
        current_year: Year treated as "now" when computing paper ages
            (``age = current_year - pub_year + 1``). Defaults to the current
            calendar year, so results do not go stale as time passes.

    Returns:
        A DataFrame with one row per publication that has a publication year
        and at least one citation, sorted by ``percentile_score`` descending,
        with the columns ``citations``, ``age``, ``title``,
        ``percentile_score`` (from ``score_papers``, rounded to 2 decimals),
        ``paper_rank`` (1 = highest score) and ``num_papers_percentile``
        (the percentile matched to the paper's rank via
        ``get_numpaper_percentiles`` / ``find_closest``). If no publication
        qualifies, an empty DataFrame with these columns is returned.

    Raises:
        ValueError: If the author's data could not be retrieved.
    """
    if current_year is None:
        current_year = datetime.now().year

    _author, publications = get_scholar_data(author_name)
    # get_scholar_data signals a failed lookup with (None, None)
    if publications is None:
        raise ValueError(f"Could not retrieve Google Scholar data for {author_name!r}")

    # Keep only papers with a known year and at least one citation
    pubs = [
        {
            "citations": p['citedby'],
            "age": current_year - int(p['bib'].get('pub_year')) + 1,
            "title": p['bib'].get('title'),
        }
        for p in publications
        if p['bib'].get('pub_year') is not None and p['citedby'] > 0
    ]

    if not pubs:
        # Nothing to score: return an empty table with the usual columns
        return pd.DataFrame(columns=[
            'citations', 'age', 'title',
            'percentile_score', 'paper_rank', 'num_papers_percentile',
        ])

    query = pd.DataFrame(pubs)

    # Citation percentile of each paper relative to papers of the same age
    query['percentile_score'] = query.apply(score_papers, axis=1)
    query['percentile_score'] = query['percentile_score'].round(2)

    # Rank papers from best (1) to worst; ties are broken by order of appearance
    query['paper_rank'] = query['percentile_score'].rank(ascending=False, method='first')
    query['paper_rank'] = query['paper_rank'].astype(int)

    # The oldest paper's age is used as the author's career length
    year = query['age'].max()
    num_papers_percentile = get_numpaper_percentiles(year)
    query['num_papers_percentile'] = query['paper_rank'].apply(
        lambda rank: find_closest(num_papers_percentile, rank)
    )
    query['num_papers_percentile'] = query['num_papers_percentile'].astype(float)

    query = query.sort_values('percentile_score', ascending=False)

    return query

# Individual author analysis

In [None]:
# Score every cited paper of a single author and display the resulting table
author_name = "Panos Ipeirotis"
result = get_author_statistics(author_name=author_name)
result

In [None]:
# One point per paper: paper-count percentile (x) against citation percentile (y),
# colored by the paper's age; both axes are fixed to the 0-100 range.
scatter_options = dict(
    x='num_papers_percentile',
    y='percentile_score',
    c='age',
    cmap='Blues_r',
    s=2,
    grid=True,
    xlim=(0, 100),
    ylim=(0, 100),
    figsize=(8, 8),
)
result.plot.scatter(**scatter_options)


In [None]:
# Area under the (num_papers_percentile, percentile_score) curve.
# The trapezoidal rule sums signed slices in the order given, so the points are
# sorted by x first; when an x value repeats, the highest percentile_score is kept.
auc = (
    result
    .filter(['num_papers_percentile', 'percentile_score'])
    .sort_values(['num_papers_percentile', 'percentile_score'], ascending=[True, False])
    .drop_duplicates(subset='num_papers_percentile', keep='first')
)

# np.trapz was deprecated in NumPy 2.0 in favor of np.trapezoid; support both.
trapezoid = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz
auc_score = trapezoid(auc['percentile_score'], auc['num_papers_percentile'])

# Both axes are on a 0-100 scale, hence the division by 100*100
print(f"AUC score: {(auc_score/(100*100)):.4f}")

# Comparative author analysis

In [None]:
# Authors to compare; each one triggers a Google Scholar lookup
authors = [
    "Andrew Ng",
    "Jon Kleinberg",
    "Yann Lecun",
    "Eric Horvitz",
    # "Jure Leskovec",
]

# Per-author statistics table, keyed by author name
results = {}
for author in authors:
    results[author] = get_author_statistics(author)

In [None]:
# Build one curve per author (indexed by num_papers_percentile, with the column
# named after the author), then align all curves on a shared x axis.
curve_columns = ['num_papers_percentile', 'percentile_score']
to_join = []
for author in authors:
    curve = results[author].filter(curve_columns)
    curve = curve.rename(columns={'percentile_score': author})
    curve = curve.drop_duplicates(subset='num_papers_percentile', keep='first')
    to_join.append(curve.set_index('num_papers_percentile'))

# Authors have values at different x positions; forward-fill the gaps left by the join
matched = pd.concat(to_join, axis=1).sort_index().ffill()

In [None]:
# One line per author on shared 0-100 axes
line_options = dict(
    figsize=(6, 6),
    grid=True,
    xlim=(0, 100),
    ylim=(0, 100),
    lw=1,
)
matched.plot.line(**line_options)
