In [None]:
import json
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

The Papers with Code `datasets.json` file was downloaded on September 17, 2024 from https://paperswithcode.com/about.
It is licensed under the CC BY-SA licence.

# Load Papers with Code dataset metadata

In [None]:
# Read the Papers with Code dataset metadata dump (a JSON document).
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's default locale encoding.
with open('../../../../Papers with code datasets/datasets.json', encoding='utf-8') as f:
    datasets_md = json.load(f)

In [None]:
# Turn the loaded metadata records into a table and preview its first rows.
datasets_df = pd.DataFrame(data=datasets_md)
datasets_df.head(n=5)

In [None]:
# List the metadata fields available for each dataset.
datasets_df.columns

In [None]:
# Show per-column dtypes and non-null counts.
datasets_df.info()

## Filter for text benchmarks

In [None]:
# Scan the two relevant columns directly (no need to materialise whole rows).
modalities_by_dataset = datasets_df["modalities"]

# dict.fromkeys de-duplicates while keeping first-seen order.
unique_modalities = list(
    dict.fromkeys(
        modality
        for modalities in modalities_by_dataset
        for modality in modalities
    )
)

# Index labels of datasets whose modalities include "Texts".
text_dataset_ids = [
    idx for idx, modalities in modalities_by_dataset.items() if "Texts" in modalities
]

# Index labels of datasets whose description mentions "benchmark" (case-insensitive).
benchmark_ids = [
    idx
    for idx, description in datasets_df["description"].items()
    if "benchmark" in description.lower()
]

print(unique_modalities)
print("Number of identified text datasets:", len(text_dataset_ids))
print("Number of identified benchmarks:", len(benchmark_ids))


**TODO:** ensure that the keyword "benchmark" captures all benchmark/evaluation datasets.

In [None]:
# Keep only the datasets flagged as containing text.
# `text_dataset_ids` holds index *labels* (collected from `iterrows`), so the
# rows are selected by label with `.loc` rather than by position; passing the
# list directly also preserves the original row order. `.copy()` yields an
# independent frame, so later column assignments do not act on a slice of
# `datasets_df`.
text_benchmarks_df = datasets_df.loc[text_dataset_ids].copy()
text_benchmarks_df.head()

In [None]:
# Alternative, stricter selection: text datasets whose description also mentions "benchmark".
# NOTE(review): left disabled — presumably until the "benchmark" keyword filter is validated; confirm before enabling.
# text_benchmarks_df = datasets_df.iloc[list(set(text_dataset_ids) & set(benchmark_ids))]
# text_benchmarks_df.head()

## Detangle task & language info

In [None]:
def _one_hot_columns(labels_per_row, index):
    """Build a 0/1 indicator frame with one column per distinct label.

    Parameters
    ----------
    labels_per_row : iterable of iterables
        For each row (in the order of ``index``), the labels attached to it.
    index : pandas.Index
        Row index of the returned frame.

    Returns
    -------
    pandas.DataFrame
        Integer frame; columns are the distinct labels in order of first
        appearance, and a cell is 1 when the row carries that label.
    """
    column_of = {}  # label -> column position, in first-seen order
    hits = []       # (row position, column position) pairs to set to 1
    for row_pos, labels in enumerate(labels_per_row):
        for label in labels:
            hits.append((row_pos, column_of.setdefault(label, len(column_of))))

    matrix = np.zeros((len(index), len(column_of)), dtype=np.int64)
    if hits:
        row_positions, col_positions = zip(*hits)
        matrix[list(row_positions), list(col_positions)] = 1
    return pd.DataFrame(matrix, index=index, columns=list(column_of))


# Task columns first, then language columns, so the added columns stay grouped.
task_flags = _one_hot_columns(
    ([task["task"] for task in tasks] for tasks in text_benchmarks_df["tasks"]),
    text_benchmarks_df.index,
)
language_flags = _one_hot_columns(text_benchmarks_df["languages"], text_benchmarks_df.index)

unique_tasks = list(task_flags.columns)
unique_languages = list(language_flags.columns)

# Attach all indicator columns in a single concat instead of one frame copy
# per new label. If a label is both a task and a language, the language flags
# win (last writer wins); indicator columns also replace any existing column
# of the same name, which keeps this cell safe to re-run.
indicator_df = pd.concat([task_flags, language_flags], axis=1)
indicator_df = indicator_df.loc[:, ~indicator_df.columns.duplicated(keep="last")]
text_benchmarks_df = pd.concat(
    [text_benchmarks_df.drop(columns=indicator_df.columns, errors="ignore"), indicator_df],
    axis=1,
)

# Per-benchmark totals.
text_benchmarks_df = text_benchmarks_df.assign(num_tasks=text_benchmarks_df[unique_tasks].sum(axis=1))
text_benchmarks_df = text_benchmarks_df.assign(num_languages=text_benchmarks_df[unique_languages].sum(axis=1))

print("Dataframe now has", len(text_benchmarks_df.columns), "columns")

In [None]:
# Number of tasks attached to each benchmark, computed once and reused below.
tasks_per_benchmark = text_benchmarks_df[unique_tasks].sum(axis=1)

print("Max. number of task per benchmark:", tasks_per_benchmark.max())
print("Median number of task per benchmark:", tasks_per_benchmark.median())
print("Number of benchmarks with more than two tasks:", tasks_per_benchmark.gt(2).sum())

## Filter for popular benchmarks

In [None]:
# Distribution of `num_papers` across the text benchmarks, drawn on the current axes.
hist_ax = plt.gca()
hist_ax.hist(text_benchmarks_df["num_papers"], bins=50)
hist_ax.set(title='Text benchmark citations', ylabel='Frequency')


In [None]:
# Summary statistics of `num_papers` for the text benchmarks.
text_benchmarks_df["num_papers"].describe()

In [None]:
# Candidate popularity cutoffs: for each percentile of `num_papers`
# (0.75 to 0.95 in steps of 0.05), the cutoff value and the number of
# benchmarks strictly above it.
# The table is built in one step from complete columns rather than by writing
# columns into a zero-row frame through `.loc`.
percentiles = np.arange(start=75, stop=100, step=5) / 100
cutoffs = text_benchmarks_df["num_papers"].quantile(percentiles).to_numpy()
percentile_table = pd.DataFrame(
    {
        "percentile": percentiles,
        "cutoff": cutoffs,
        "num_of_benchmarks_included": [
            int((text_benchmarks_df["num_papers"] > cutoff).sum()) for cutoff in cutoffs
        ],
    }
)
percentile_table

In [None]:
# Selecting 85% percentile
popular_text_benchmarks_df = text_benchmarks_df.loc[text_benchmarks_df["num_papers"] > percentile_table.loc[percentile_table["percentile"] == .95, "cutoff"].values[0]]
len(popular_text_benchmarks_df)

In [None]:
# Preview the selected popular benchmarks.
popular_text_benchmarks_df.head()

## Plot over time

### Only popular

In [None]:
# Parse `introduced_date` into datetimes.
# The column is replaced via `assign` rather than written through
# `.loc[:, col] = ...`: the latter sets values in place, which can leave the
# column with its previous (object) dtype and warns when the frame is a slice.
popular_text_benchmarks_df = popular_text_benchmarks_df.assign(
    introduced_date=pd.to_datetime(popular_text_benchmarks_df["introduced_date"])
)
# Keep only rows where both the date and the paper count are present.
temp_df = popular_text_benchmarks_df[["introduced_date", "num_papers"]].dropna()
temp_df.info()

In [None]:
# Scatter of paper count against introduction date for the popular benchmarks.
sns.set_theme(rc={"figure.figsize": (6, 5)})  # (width, height) in inches
# `plt.plot` with the "o" marker format replaces the deprecated `plt.plot_date`;
# datetime values on the x-axis are converted by matplotlib automatically.
plt.plot(temp_df["introduced_date"].values, temp_df["num_papers"].values, "o")
plt.ylabel("Number of citations", fontsize=15)
plt.xlabel("Date of introduction", fontsize=15)
plt.tick_params(labelsize=15)

### All

In [None]:
# Parse `introduced_date` into datetimes for all text benchmarks.
# The column is replaced via `assign` rather than written through
# `.loc[:, col] = ...`: the latter sets values in place, which can leave the
# column with its previous (object) dtype.
text_benchmarks_df = text_benchmarks_df.assign(
    introduced_date=pd.to_datetime(text_benchmarks_df["introduced_date"])
)
# Keep only rows where both the date and the paper count are present.
temp_df = text_benchmarks_df[["introduced_date", "num_papers"]].dropna()
temp_df.info()

In [None]:
# Scatter of paper count against introduction date for all text benchmarks.
sns.set_theme(rc={"figure.figsize": (6, 5)})  # (width, height) in inches
# `plt.plot` with the "o" marker format replaces the deprecated `plt.plot_date`;
# datetime values on the x-axis are converted by matplotlib automatically.
plt.plot(temp_df["introduced_date"].values, temp_df["num_papers"].values, "o")
plt.ylabel("Number of citations", fontsize=15)
plt.xlabel("Date of introduction", fontsize=15)
plt.tick_params(labelsize=15)

In [None]:
# Names of the popular benchmarks with more than 2000 papers.
over_2000_papers = popular_text_benchmarks_df["num_papers"] > 2000
popular_text_benchmarks_df.loc[over_2000_papers, "name"]

In [None]:
# Name and paper count of the popular benchmarks with more than 1000 papers.
over_1000_papers = popular_text_benchmarks_df["num_papers"] > 1000
popular_text_benchmarks_df.loc[over_1000_papers, ["name", "num_papers"]]

## Tasks & Languages

In [None]:
# Popular benchmarks whose task indicator columns sum to more than 100.
task_totals = popular_text_benchmarks_df[unique_tasks].sum(axis=1)
popular_text_benchmarks_df.loc[task_totals > 100, ["name", "num_tasks"]]

In [None]:
# Bar chart of the 20 popular benchmarks with the most tasks.
sns.set_theme(rc={"figure.figsize":(5, 8)})  # (width, height) in inches
temp_df = (
    popular_text_benchmarks_df[["name", "num_tasks"]]
    .sort_values(by="num_tasks", ascending=False)
    .head(20)
)

ax = sns.barplot(x=temp_df["num_tasks"], y=temp_df["name"])
ax.set_title('Number of tasks per benchmark', fontdict={'size': 15})

In [None]:
# Bar chart of the 20 popular benchmarks with the most languages.
sns.set_theme(rc={"figure.figsize":(5, 8)})  # (width, height) in inches
temp_df = (
    popular_text_benchmarks_df[["name", "num_languages"]]
    .sort_values(by="num_languages", ascending=False)
    .head(20)
)

ax = sns.barplot(x=temp_df["num_languages"], y=temp_df["name"])
ax.set_title('Number of languages per benchmark', fontdict={'size': 15})

In [None]:
# Share of popular benchmarks (in percent) that cover each task, most common first.
sns.set_theme(rc={"figure.figsize":(5, 50)})  # (width, height) in inches
temp_df = pd.DataFrame(
    {
        "task_name": unique_tasks,
        "pcnt_benchmarks": popular_text_benchmarks_df[unique_tasks].sum().values
        / len(popular_text_benchmarks_df)
        * 100,
    }
)
temp_df = temp_df.sort_values(by="pcnt_benchmarks", ascending=False)

# Take the first 50 rows by position: after sorting, the index labels are
# shuffled, so a label slice such as `.loc[:50]` would not pick the top 50.
top_tasks = temp_df.head(50)
ax = sns.barplot(x=top_tasks["pcnt_benchmarks"], y=top_tasks["task_name"])
ax.set_title('Percentage of popular benchmarks covering this task', fontdict={'size': 15})


In [None]:
# Bar chart of the first 20 rows of the task coverage table held in `temp_df`.
sns.set_theme(rc={"figure.figsize":(5, 8)})  # (width, height) in inches
first_20_tasks = temp_df.head(20)
ax = sns.barplot(x=first_20_tasks["pcnt_benchmarks"], y=first_20_tasks["task_name"])
ax.set_xlabel("Percentage of popular benchmarks covering this task", fontsize=15)

In [None]:
# Share of popular benchmarks (in percent) that include each language; plot the 20 most common.
sns.set_theme(rc={"figure.figsize":(5, 8)})  # (width, height) in inches
language_share = popular_text_benchmarks_df[unique_languages].sum().values / len(popular_text_benchmarks_df) * 100
temp_df = pd.DataFrame({"language": unique_languages, "pcnt_benchmarks": language_share})
temp_df = temp_df.sort_values(by="pcnt_benchmarks", ascending=False)

first_20_languages = temp_df.head(20)
ax = sns.barplot(x=first_20_languages["pcnt_benchmarks"], y=first_20_languages["language"])
ax.set_xlabel("Percentage of popular benchmarks including this language", fontsize=15)

## Exploring list of most popular benchmarks

In [None]:
# Overview of the popular benchmarks, most-used first.
print("Benchmarks sorted by number of papers")
overview_columns = ["name", "tasks", "num_papers", "num_tasks", "num_languages"]
popular_text_benchmarks_df[overview_columns].sort_values(by="num_papers", ascending=False)

In [None]:
# Bar chart of paper counts for every popular benchmark, most-used first.
sns.set_theme(rc={"figure.figsize":(5, 30)})  # (width, height) in inches
popular_by_papers = popular_text_benchmarks_df.sort_values(by="num_papers", ascending=False)
ax = sns.barplot(data=popular_by_papers, x='num_papers', y='name')
ax.set_xlabel('Number of papers cited by', fontdict={'size': 15})


In [None]:
# Text benchmarks not flagged with the "Question Answering" task, most-used first.
not_question_answering = text_benchmarks_df["Question Answering"].ne(1)
non_qa_text_benchmarks = text_benchmarks_df.loc[not_question_answering].sort_values(
    by="num_papers", ascending=False
)

In [None]:
# Bar chart of the 20 most-used non-QA text benchmarks.
sns.set_theme(rc={"figure.figsize":(5, 8)})  # (width, height) in inches
ax = sns.barplot(data=non_qa_text_benchmarks.head(20), x='num_papers', y='name')
ax.set_xlabel('Number of papers cited by', fontdict={'size': 15})

### Most popular QA benchmarks only

In [None]:
# Text benchmarks flagged with the "Question Answering" task, most-used first.
is_question_answering = text_benchmarks_df["Question Answering"].eq(1)
qa_text_benchmarks = text_benchmarks_df.loc[is_question_answering].sort_values(
    by="num_papers", ascending=False
)

In [None]:
# Candidate popularity cutoffs for QA benchmarks: for each percentile of
# `num_papers` (0.50 to 0.95 in steps of 0.05), the cutoff value and the
# number of QA benchmarks strictly above it.
# The table is built in one step from complete columns rather than by writing
# columns into a zero-row frame through `.loc`.
qa_percentiles = np.arange(start=50, stop=100, step=5) / 100
qa_cutoffs = qa_text_benchmarks["num_papers"].quantile(qa_percentiles).to_numpy()
percentile_table_qa = pd.DataFrame(
    {
        "percentile": qa_percentiles,
        "cutoff": qa_cutoffs,
        "num_of_benchmarks_included": [
            int((qa_text_benchmarks["num_papers"] > cutoff).sum()) for cutoff in qa_cutoffs
        ],
    }
)
percentile_table_qa

In [None]:
# Selecting 85% percentile
popular_qa_text_benchmarks = qa_text_benchmarks.loc[qa_text_benchmarks["num_papers"] >= percentile_table_qa.loc[percentile_table_qa["percentile"] == .90, "cutoff"].values[0]]
len(popular_qa_text_benchmarks)

In [None]:
# Bar chart of paper counts for the popular QA benchmarks (already sorted most-used first).
sns.set_theme(rc={"figure.figsize":(5, 8)})  # (width, height) in inches
ax = sns.barplot(data=popular_qa_text_benchmarks, y='name', x='num_papers')
ax.set_xlabel('Number of papers cited by', fontdict={'size': 15})