<a href="https://colab.research.google.com/github/ipeirotis-org/datasets/blob/main/Percentiles_for_Publications.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm.auto import tqdm
# Register tqdm's `progress_apply` / `progress_map` methods on pandas objects.
tqdm.pandas()

In [None]:
# Getting raw data from https://github.com/sentian/SciImpactRanking

# Location of the raw citations CSV in that repository.
url = 'https://media.githubusercontent.com/media/sentian/SciImpactRanking/master/data/raw/citations.csv'

# Download and parse the CSV; `benchmark_df` is the input of every analysis in this notebook.
benchmark_df = pd.read_csv(url)

In [None]:
# Inspect the raw data (columns used in this notebook: aut.id, pub.id, start, age, citations).
benchmark_df

# Citation percentiles for papers over time

In [None]:
# Build an (age x publication) table of cumulative citation counts.
# `pivot_table` aggregates repeated (pub.id, age) entries with its default aggfunc (mean);
# after the transpose, rows are ages and columns are publications.
pvt = (
    benchmark_df
    .pivot_table(index='pub.id', columns='age', values='citations')
    .T
    .cumsum(skipna=True)
)

# Forward-fill gaps *inside* each publication's observed age range with the running total,
# but keep NaN after the last observed age, so a paper is not counted at ages for which
# it has no data. `bfill().notna()` is True exactly up to (and including) each column's
# last valid row, which makes this a vectorized equivalent of masking column by column.
pvt = pvt.ffill().where(pvt.bfill().notna())

# Percentile grid 0.0, 0.1, ..., 100.0. A step of exactly 0.1 needs 1001 points:
# with 1000 points the step is 100/999, so the one-decimal labels skip '50.0' and
# no longer match the percentile that is actually computed.
percentiles = np.linspace(0, 100, 1001)
percentile_labels = [str(round(p, 1)) for p in percentiles]

# Compute percentiles for each row (age), ignoring publications with no data at that age
percentile_df = pvt.apply(lambda row: np.percentile(row.dropna(), percentiles), axis=1, result_type='expand')
percentile_df.columns = percentile_labels
percentile_df = percentile_df.round(0).astype(int)


In [None]:
# Save the citation percentile table (rows: publication age, columns: percentile labels) as CSV.
percentile_df.to_csv('percentiles.csv')

In [None]:
# This is our benchmark dataframe: one row per publication age (`age`), one column per
# percentile label, and each cell is the cumulative number of citations at that percentile.
percentile_df

In [None]:
# Example lookup: the cumulative citations a paper needs at age 5 to reach the
# 99.1 percentile (the original run of this notebook reported 364 citations).

year = 5  # row label: publication age
percentile = '99.1'  # column label: percentiles are stored as one-decimal strings
citations = percentile_df.loc[year,percentile]
print(f"The number of citations necessary to score at the {percentile} percentile at year {year} is {citations}")


# Percentiles for number of publications over time



In [None]:
# One record per author/publication: keep only the age == 1 rows and the three
# identifying columns, renamed to author / publication / year.
# NOTE(review): presumably every (aut.id, pub.id) pair has exactly one age == 1 row — confirm.
author_df = (
    benchmark_df
    .query("age==1")
    .filter(["aut.id", "pub.id", "start"])
    .set_axis(["author", "publication", "year"], axis=1)
    .reset_index(drop=True)
)
author_df

In [None]:
# Step 1: Identify first publication year for each author
first_pub_year = author_df.groupby('author')['year'].min().rename('first_pub_year')

# Step 2: Merge this information back to the original DataFrame
author_df = author_df.merge(first_pub_year, on='author')

# Step 3: Calculate years since first publication and aggregate data
author_df['years_since_first_pub'] = author_df['year'] - author_df['first_pub_year']

In [None]:
# Publications per author for each career year (rows: years since first publication,
# columns: authors). Cells are NaN where the author has no publication in that year.
yearly_publication_counts = author_df.pivot_table(
    values='publication',
    index='years_since_first_pub',
    columns='author',
    aggfunc='count',
)

# Running total per author; the forward fill carries the total across years without
# publications.
# NOTE(review): the forward fill also extends every author to the longest career in the
# data, whereas the citation table is cut off after each paper's last observed age —
# confirm this is intended.
author_pvt = yearly_publication_counts.cumsum().ffill()

In [None]:
# Percentiles of the cumulative publication count for each career year (one row per year).
# NOTE: this rebinds `percentile_df`, which until now held the citation percentiles.
def _row_percentiles(row):
    """Return the values of `row` at every entry of `percentiles`, ignoring NaNs."""
    return np.percentile(row.dropna(), percentiles)


percentile_labels = [str(round(p, 1)) for p in percentiles]

percentile_df = (
    author_pvt
    .apply(_row_percentiles, axis=1, result_type='expand')
    .set_axis(percentile_labels, axis=1)
    .round(0)
    .astype(int)
)

percentile_df

In [None]:

# swapped_series_with_highest_index

In [None]:
# Save the publication-count percentile table (rows: years since first publication,
# columns: percentile labels) as CSV.
percentile_df.to_csv('author_numpapers_percentiles.csv')

## Normalization examples

In [None]:
# Example lookup: the number of papers an author needs, 23 years after their first
# publication, to be at the 93.5 percentile of cumulative publication counts.

year = 23  # row label: years since the author's first publication
percentile = '93.5'  # column label: percentiles are stored as one-decimal strings
# Stored under its own name: this is a publication count, not a citation count.
num_publications = percentile_df.loc[year, percentile]
print(f"The number of publications necessary to score at the {percentile} percentile at year {year} is {num_publications}")


In [None]:
# How to translate the values from raw number of papers to percentiles:

# Publication-count thresholds for the selected `year` (index: percentile labels).
s = percentile_df.loc[year,:]

# Group the percentile labels by their threshold value and keep the last label of each
# group. Nothing is sorted here: "last" is the highest percentile only because the
# columns of percentile_df are already in ascending percentile order.
highest_indices = s.groupby(s).apply(lambda x: x.index[-1])

# Swap index and values: percentile label -> number of papers
sw = pd.Series(index=highest_indices.values, data=highest_indices.index)

# Swap back: number of papers -> highest percentile label reached with that many papers
normalized_values = pd.Series(data=sw.index, index=sw.values)

normalized_values.tail(60)

In [None]:
# Percentile column labels (one-decimal strings) to highlight in the plot and summary cells.
percs_of_interest = ["50.0","75.0","90.0","95.0","97.5","99.0","99.5","99.8","99.9"]

In [None]:
# Publication-count thresholds over the first 40 career years for the selected percentiles.
# NOTE: `.filter` silently ignores labels that are not columns of `percentile_df`.
ax = percentile_df.filter(percs_of_interest).plot(xlim=(0, 40), grid=True)
ax.set(
    xlabel="Years since first publication",
    ylabel="Cumulative number of publications",
    title="Publication-count percentiles over an author's career",
)
ax.legend(title="Percentile");


In [None]:
# Average row-to-row (year-over-year) increase of the publication threshold,
# computed separately for each selected percentile.
percentile_df.filter(percs_of_interest).diff(1).mean()