# Annotations

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.cm as cm
import colorcet as cc
from holoviews.plotting.bokeh.styles import font_size
from sklearn.metrics import cohen_kappa_score

In [None]:
annotations_folder = "."

## Plotting helpers

In [None]:
# Categorical colormap built from colorcet's Glasbey colours, skipping the
# first five entries of the colour list.
cmap = matplotlib.colors.ListedColormap(cc.cm.glasbey.colors[5:])


def palette(n):
    """Return a seaborn colour palette with ``n`` Glasbey colours."""
    return sns.color_palette(cc.glasbey, n_colors=n)

In [None]:
def map_list_to_color(lst):
    """Assign one Glasbey colour to each entry of ``lst``.

    Colours are taken in order from ``cc.cm.glasbey.colors`` starting at
    index 5. ``zip`` stops at the shorter input, so entries beyond the
    number of available colours receive no mapping.

    Args:
        lst: Sized iterable of hashable labels.

    Returns:
        dict mapping each label to its colour.
    """
    colors = cc.cm.glasbey.colors[5:len(lst) + 5]
    # Returned directly instead of being bound to a local called ``map``,
    # which would shadow the builtin.
    return dict(zip(lst, colors))

# Overview of benchmarks

In [None]:
benchmarks_overview = pd.read_excel(os.path.join(annotations_folder, "qa_benchmarks_overview.xlsx"))
benchmarks_overview["Year"].describe()

In [None]:
benchmarks_overview["Year"].hist()

# Analysis of annotations

## Load

In [None]:

annotations_survey_results = pd.read_csv(os.path.join(annotations_folder, "results-survey753164-corrected-readable-titles.tsv"), sep="\t")
annotations_survey_results.head()

In [None]:
# Drop the two excluded benchmarks, then normalise misspelled/abbreviated
# benchmark identifiers (the replacement is applied to every cell of the frame).
keep_rows = ~annotations_survey_results["bench_abbrev"].isin(["newsvqa", "newskvqa"])
annotations_survey_results = annotations_survey_results.loc[keep_rows, :]
name_fixes = {"naturalq": "naturalquestions", "thruthfulqa": "truthfulqa"}
annotations_survey_results = annotations_survey_results.replace(name_fixes)

In [None]:
# Free-text columns: every "*comment*" column plus the benchmark metadata.
comment_cols = [name for name in annotations_survey_results.columns if "comment" in name]
metadata_cols = ["bench_abbrev", "benchname", "institution", "source_concrete", "benchtype", "lang", "id"]
open_cols = comment_cols + metadata_cols
# Everything else is treated as a yes/no item; the "other" items are
# additionally excluded for the agreement computation.
yes_no_cols = [name for name in annotations_survey_results.columns if name not in open_cols]
yes_no_wo_other = [name for name in yes_no_cols if "other" not in name]

In [None]:
(annotations_survey_results["bench_abbrev"].unique())

In [None]:
assert len(annotations_survey_results["bench_abbrev"].unique()) == 30

In [None]:
# Opt in to pandas' future behaviour so that fillna/replace do not silently
# downcast object columns.
pd.set_option("future.no_silent_downcasting", True)
#annotations_survey_results = annotations_survey_results.replace({"Y": 1, "N": 0})
# Missing values in the yes/no columns are filled with "N".
annotations_survey_results.loc[:, yes_no_cols] = annotations_survey_results.loc[:, yes_no_cols].fillna("N")

## Interannotator agreement

In [None]:
# Cohen's kappa per benchmark, computed between the first two annotation
# rows of that benchmark over the yes/no items without the "other" columns.
kappas = []
for bench in annotations_survey_results["bench_abbrev"].unique():
    bench_rows = annotations_survey_results["bench_abbrev"] == bench
    annotations_1_and_2 = annotations_survey_results.loc[bench_rows, yes_no_wo_other]
    first_annotation = annotations_1_and_2.iloc[0]
    second_annotation = annotations_1_and_2.iloc[1]
    kappa = cohen_kappa_score(first_annotation, second_annotation)
    print(bench, kappa)
    kappas.append(kappa)


In [None]:
np.mean(kappas) # Interannotator agreement w/o "other" category

In [None]:
np.std(kappas)

In [None]:
annotations_survey_results = annotations_survey_results.replace({"Y": 1, "N": 0})

In [None]:
annotations_survey_results[annotations_survey_results["id"] == 36]

In [None]:
# Rows with id <= 36 form the external round, rows with id > 36 the internal
# round; both frames get a fresh positional index (the old one is kept as
# an "index" column by reset_index()).
annotation_ids = annotations_survey_results["id"]
external_annotations = annotations_survey_results.loc[annotation_ids <= 36].reset_index()
internal_annotations = annotations_survey_results.loc[annotation_ids > 36].reset_index()
assert len(external_annotations) == len(internal_annotations)

In [None]:
benchmarks_wo_human_anno = internal_annotations.loc[internal_annotations["anno_how_human"] == 0, "bench_abbrev"].values
benchmarks_wo_human_anno

In [None]:
# Tally the 0/1 answers of every yes/no item, once for the internal and once
# for the external annotation round.
vc_int_ext_dict = {"item": [], "internal_0": [], "external_0": [], "internal_1": [], "external_1": []}
# Each frame gets its own row mask. Reusing the internal mask on the external
# frame only works through index alignment and silently selects the wrong
# rows if the two frames list the benchmarks in a different order.
internal_has_human_anno = ~internal_annotations["bench_abbrev"].isin(benchmarks_wo_human_anno)
external_has_human_anno = ~external_annotations["bench_abbrev"].isin(benchmarks_wo_human_anno)
for col in yes_no_cols:
    vc_int_ext_dict["item"].append(col)
    if any(marker in col for marker in ("identity", "recruitment")):
        # only look at annotator details for benchmarks that involve human annotation
        i = internal_annotations.loc[internal_has_human_anno, col].value_counts()
        e = external_annotations.loc[external_has_human_anno, col].value_counts()
    else:
        i = internal_annotations.loc[:, col].value_counts()
        e = external_annotations.loc[:, col].value_counts()
    for prefix, counts in (("internal", i), ("external", e)):
        for answer in (0, 1):
            # A value that never occurs is absent from value_counts(); record it as 0.
            vc_int_ext_dict[f"{prefix}_{answer}"].append(counts[answer] if answer in counts else 0)
vc_int_ext_df = pd.DataFrame(vc_int_ext_dict)
vc_int_ext_df

### Detailed analysis of internal and external annotation differences

In [None]:
def print_diff_for_col(col):
    """Print every benchmark whose internal and external answers differ for ``col``.

    For each differing benchmark the benchmark name is printed, followed by
    the internal and the external values together with the matching
    ``<col>_comment`` entries.

    Args:
        col: Name of a column present in both ``internal_annotations`` and
            ``external_annotations``. A ``col + "_comment"`` column is read
            as soon as a difference is found.
    """
    for b in internal_annotations["bench_abbrev"].unique():
        internal_rows = internal_annotations["bench_abbrev"] == b
        external_rows = external_annotations["bench_abbrev"] == b
        internal_values = internal_annotations.loc[internal_rows, col].values
        external_values = external_annotations.loc[external_rows, col].values
        # array_equal yields a single bool even when a benchmark has several
        # rows or a different number of rows in the two frames; a bare `!=`
        # between the arrays is ambiguous (or fails) in those cases.
        if not np.array_equal(internal_values, external_values):
            print(b)
            print("Internal: ", internal_values,
                  internal_annotations.loc[internal_rows, col + "_comment"].values)
            print("External: ", external_values,
                  external_annotations.loc[external_rows, col + "_comment"].values)

In [None]:
for col in yes_no_cols:
    print(col)
    print_diff_for_col(col)


## Reported annotator demographics

In [None]:
identity_demographic_color_map = map_list_to_color([col for col in internal_annotations.columns if "identity_demographic" in col and not "comment" in col])

In [None]:
internal_annotations.loc[internal_annotations["identity_demographic_none"] == 0, ["bench_abbrev"]+[col for col in internal_annotations.columns if "identity_demographic" in col]]

In [None]:
external_annotations.loc[external_annotations["identity_demographic_none"] == 0, ["bench_abbrev"]+[col for col in internal_annotations.columns if "identity_demographic" in col]]

In [None]:
# Pie chart of the demographic details reported about annotators
# (internal annotations); only categories with at least one "yes" are shown.
data = vc_int_ext_df.loc[(vc_int_ext_df["item"].str.contains("identity_demographic")) & (vc_int_ext_df["internal_1"] > 0), ["item", "internal_1"]]
data = data.sort_values(by="internal_1")
plt.figure(figsize=(6,6))
sns.set_style("whitegrid")
relabel_map = {"edu": "education", "domain": "area of expertise", "recruitment_country": "recruitment country", "origin_country": "country of origin"}
x = data.internal_1
colors = [identity_demographic_color_map[item] for item in data.item]
labels = [label.replace("identity_demographic_", "") for label in data.item]
labels = [relabel_map.get(label, label) for label in labels]
# matplotlib passes autopct each wedge's share of sum(x) in percent, so the
# absolute count has to be recovered with sum(x) rather than a fixed 30.
patches, texts, autotexts = plt.pie(x, labels=labels, autopct=(lambda p, total=x.sum(): f"{p * total / 100:.0f}"), colors=colors)
for txt in texts:
    txt.set_fontsize(16)
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontsize(18)
# Add a title
plt.title("Reported demographic information about annotators", fontsize=20)
# Save the plot; savefig does not create a missing output directory.
os.makedirs(os.path.join(annotations_folder, "images"), exist_ok=True)
plt.savefig(os.path.join(annotations_folder, "images", 'identity_demographic.pdf'), format='pdf', dpi=300, bbox_inches='tight')

In [None]:
# Pie chart of the demographic details reported about annotators
# (external annotations); only categories with at least one "yes" are shown.
data = vc_int_ext_df.loc[(vc_int_ext_df["item"].str.contains("identity_demographic")) & (vc_int_ext_df["external_1"] > 0), ["item", "external_1"]]
data = data.sort_values(by="external_1")
plt.figure(figsize=(6,6))
sns.set_style("whitegrid")
relabel_map = {"edu": "education", "domain": "area of expertise", "recruitment_country": "recruitment country", "origin_country": "country of origin"}
x = data.external_1
colors = [identity_demographic_color_map[item] for item in data.item]
labels = [label.replace("identity_demographic_", "") for label in data.item]
labels = [relabel_map.get(label, label) for label in labels]
# matplotlib passes autopct each wedge's share of sum(x) in percent, so the
# absolute count has to be recovered with sum(x) rather than a fixed 30.
patches, texts, autotexts = plt.pie(x, labels=labels, autopct=(lambda p, total=x.sum(): f"{p * total / 100:.0f}"), colors=colors)
for txt in texts:
    txt.set_fontsize(16)
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontsize(18)
# Add a title
plt.title("Reported demographic information about annotators\n(external annotations)", fontsize=20)

# Save the plot; savefig does not create a missing output directory.
os.makedirs(os.path.join(annotations_folder, "images"), exist_ok=True)
plt.savefig(os.path.join(annotations_folder, "images", 'identity_demographic_ext.pdf'), format='pdf', dpi=300, bbox_inches='tight')

## Language of benchmark

In [None]:
internal_annotations.loc[internal_annotations["lang"] != "English", "bench_abbrev"]

In [None]:
external_annotations.loc[external_annotations["lang"] != "English", "bench_abbrev"]

In [None]:
external_annotations.loc[external_annotations["bench_abbrev"] == "okvqa", "lang"]


## How was the data annotated?

In [None]:
internal_annotations["anno_how_human"].value_counts()

In [None]:
external_annotations["anno_how_human"].value_counts()

In [None]:
internal_annotations.loc[internal_annotations["anno_how_human"] == 0, "bench_abbrev"]

In [None]:
external_annotations.loc[external_annotations["anno_how_human"] == 0, "bench_abbrev"]

In [None]:
external_annotations.loc[external_annotations["bench_abbrev"].isin(["arc", "scienceqa", "xquad"]), "anno_how_human_comment"].values

### Detailed look at annotator disagreement

In [None]:
print_diff_for_col("anno_how_human")

## Recruitment criteria

In [None]:
internal_annotations.loc[(internal_annotations["recruitment_criteria_none"] == 0) & (internal_annotations["anno_how_human"] == 1), "bench_abbrev"]

In [None]:
# NOTE(review): identical to the preceding cell; possibly meant to query
# external_annotations instead — confirm.
internal_annotations.loc[(internal_annotations["recruitment_criteria_none"] == 0) & (internal_annotations["anno_how_human"] == 1), "bench_abbrev"]

In [None]:
recruitment_criteria_labels = [col for col in internal_annotations.columns if "recruitment_criteria" in col]
recruitment_criteria_labels

In [None]:
internal_annotations.loc[internal_annotations["bench_abbrev"] == "coqa", recruitment_criteria_labels]

In [None]:
internal_annotations.loc[(internal_annotations["anno_how_human"] == 1), ["bench_abbrev"] + recruitment_criteria_labels ]

In [None]:
external_annotations.loc[(external_annotations["recruitment_criteria_none"] == 0) & (external_annotations["anno_how_human"] == 1), ["bench_abbrev"] + recruitment_criteria_labels ]

In [None]:
vc_int_ext_df.loc[vc_int_ext_df["item"].str.contains("recruitment_criteria"), ["item", "external_1"]]

In [None]:
vc_int_ext_df.loc[vc_int_ext_df["item"].str.contains("recruitment_criteria"), ["item", "internal_1"]]

In [None]:
identity_demographic_color_map

In [None]:
recruitment_criteria_color_map = map_list_to_color([col for col in internal_annotations.columns if "recruitment_criteria" in col and not "comment" in col])
recruitment_criteria_color_map

In [None]:
recruitment_criteria_color_map["recruitment_criteria_other"] = identity_demographic_color_map["identity_demographic_other"]

In [None]:
# Pie chart of the reported annotator recruitment criteria (internal annotations).
data = vc_int_ext_df.loc[vc_int_ext_df["item"].str.contains("recruitment_criteria"), ["item", "internal_1"]].copy()
# Fold the "rank" counts into "task" before dropping empty categories and
# sorting: .sum() also works when the rank row is zero or missing, "task"
# is kept when only "rank" was reported, and the wedge order reflects the
# merged counts.
is_rank = data["item"] == "recruitment_criteria_rank"
is_task = data["item"] == "recruitment_criteria_task"
data.loc[is_task, "internal_1"] += data.loc[is_rank, "internal_1"].sum()
data = data.loc[~is_rank & (data["internal_1"] > 0), :].sort_values(by="internal_1")

plt.figure(figsize=(6,6))
sns.set_style("whitegrid")
relabel_map = {"available": "availability", "task": "task performance", "expertise": "domain expertise"}
x = data.internal_1
colors = [recruitment_criteria_color_map[item] for item in data.item]
labels = [label.replace("recruitment_criteria_", "") for label in data.item]
labels = [relabel_map.get(label, label) for label in labels]
# matplotlib passes autopct each wedge's share of sum(x) in percent, so the
# absolute count has to be recovered with sum(x) rather than a fixed 30.
patches, texts, autotexts = plt.pie(x, labels=labels, autopct=(lambda p, total=x.sum(): f"{p * total / 100:.0f}"), colors=colors)
for txt in texts:
    txt.set_fontsize(16)
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontsize(18)
# Add a title
plt.title("Reported annotator recruitment criteria", fontsize=20)
# Save the plot; savefig does not create a missing output directory.
os.makedirs(os.path.join(annotations_folder, "images"), exist_ok=True)
plt.savefig(os.path.join(annotations_folder, "images", 'recruitment_criteria.pdf'), format='pdf', dpi=300, bbox_inches='tight')

In [None]:
# Pie chart of the reported annotator recruitment criteria (external annotations).
data = vc_int_ext_df.loc[vc_int_ext_df["item"].str.contains("recruitment_criteria"), ["item", "external_1"]].copy()
# Fold the "rank" counts into "task" before dropping empty categories and
# sorting: .sum() also works when the rank row is zero or missing, "task"
# is kept when only "rank" was reported, and the wedge order reflects the
# merged counts.
is_rank = data["item"] == "recruitment_criteria_rank"
is_task = data["item"] == "recruitment_criteria_task"
data.loc[is_task, "external_1"] += data.loc[is_rank, "external_1"].sum()
data = data.loc[~is_rank & (data["external_1"] > 0), :].sort_values(by="external_1")

plt.figure(figsize=(6,6))
sns.set_style("whitegrid")
relabel_map = {"available": "availability", "task": "task performance", "expertise": "domain expertise"}
x = data.external_1
colors = [recruitment_criteria_color_map[item] for item in data.item]
labels = [label.replace("recruitment_criteria_", "") for label in data.item]
labels = [relabel_map.get(label, label) for label in labels]
# matplotlib passes autopct each wedge's share of sum(x) in percent, so the
# absolute count has to be recovered with sum(x) rather than a fixed 30.
patches, texts, autotexts = plt.pie(x, labels=labels, autopct=(lambda p, total=x.sum(): f"{p * total / 100:.0f}"), colors=colors)
for txt in texts:
    txt.set_fontsize(16)
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontsize(18)
# Add a title
plt.title("Reported annotator recruitment criteria\n(external annotations)", fontsize=20)
# Save the plot; savefig does not create a missing output directory.
os.makedirs(os.path.join(annotations_folder, "images"), exist_ok=True)
plt.savefig(os.path.join(annotations_folder, "images", 'recruitment_criteria_ext.pdf'), format='pdf', dpi=300, bbox_inches='tight')

## Are the contents made transparent in the reports?

In [None]:
internal_annotations["content_none"].value_counts()

In [None]:
content_labels = [col for col in internal_annotations.columns if "content" in col]
content_labels

In [None]:
for col in content_labels:
    if "comment" not in col:
        print(internal_annotations[col].value_counts())

What contents are found?

In [None]:
knowledge_type_labels = [col for col in internal_annotations.columns if "knowledge_type" in col]
knowledge_type_labels

In [None]:
for col in knowledge_type_labels:
    if "comment" not in col:
        print(internal_annotations[col].value_counts())

In [None]:
knowledge_type_labels_map = {"knowledge_type_news": "news/entertainment/pop culture", 
                             "knowledge_type_everyday": "everyday/world knowledge",
                             "knowledge_type_edu": "education",
                             "knowledge_type_art": "art/design/music",
                             "knowledge_type_maths": "maths",
                             "knowledge_type_lang": "language/linguistics",
                             "knowledge_type_commonsense": "commonsense",
                             "knowledge_type_encycl": "encyclopedic",
                             "knowledge_type_humanities": "humanities",
                             "knowledge_type_sosci": "social science",
                             "knowledge_type_stem": "science/technology/engineering",
                             "knowledge_type_medicine": "medicine/health",
                             "knowledge_type_business": "business/economics/finance",
                             "knowledge_type_other": "other"}

In [None]:
# Horizontal bar chart: number of benchmarks per knowledge domain
# (external annotations), most frequent domain first.
import matplotlib.ticker as mticker
plt.figure(figsize=(5,3))
# Only the actual domain items: skip the free-text comment and "none" columns.
knowledge_type_cols = [col for col in knowledge_type_labels if ("comment" not in col) and ("none" not in col)]
knowledge_type_counts = [int((external_annotations[col] == 1).sum()) for col in knowledge_type_cols]
knowledge_type_labels_no_comment = [knowledge_type_labels_map[col] for col in knowledge_type_cols]
sorted_lists = sorted(zip(knowledge_type_counts, knowledge_type_labels_no_comment), reverse=True)
knowledge_type_counts, knowledge_type_labels_no_comment = zip(*sorted_lists)
sns.barplot(x=np.array(knowledge_type_counts), y=np.array(knowledge_type_labels_no_comment), color=cc.cm.glasbey.colors[10])
plt.gca().xaxis.set_major_locator(mticker.MultipleLocator(5))
plt.title("Domains (external annotations)")
plt.tight_layout()
# Create the directory the figure is actually written to (previously
# "../images" was created while the file goes to <annotations_folder>/images).
os.makedirs(os.path.join(annotations_folder, "images"), exist_ok=True)
plt.savefig(os.path.join(annotations_folder, "images", "knowledge_types_ext.pdf"), dpi=300)

In [None]:
# Horizontal bar chart: number of benchmarks per knowledge domain
# (internal annotations), most frequent domain first.
import matplotlib.ticker as mticker
plt.figure(figsize=(5,3))
# Only the actual domain items: skip the free-text comment and "none" columns.
knowledge_type_cols = [col for col in knowledge_type_labels if ("comment" not in col) and ("none" not in col)]
knowledge_type_counts = [int((internal_annotations[col] == 1).sum()) for col in knowledge_type_cols]
knowledge_type_labels_no_comment = [knowledge_type_labels_map[col] for col in knowledge_type_cols]
sorted_lists = sorted(zip(knowledge_type_counts, knowledge_type_labels_no_comment), reverse=True)
knowledge_type_counts, knowledge_type_labels_no_comment = zip(*sorted_lists)
sns.barplot(x=np.array(knowledge_type_counts), y=np.array(knowledge_type_labels_no_comment), color=cc.cm.glasbey.colors[10])
plt.gca().xaxis.set_major_locator(mticker.MultipleLocator(5))
plt.title("Domains (internal annotations)")
plt.tight_layout()
# Create the directory the figure is actually written to (previously
# "../images" was created while the file goes to <annotations_folder>/images).
os.makedirs(os.path.join(annotations_folder, "images"), exist_ok=True)
plt.savefig(os.path.join(annotations_folder, "images", "knowledge_types.pdf"), dpi=300)

In [None]:
internal_annotations["bench_abbrev"]

In [None]:
internal_annotations.columns[:15]

In [None]:
bench = "scienceqa"
for col in knowledge_type_labels:
    if ("comment" not in col) and ("none" not in col):
        val = internal_annotations.loc[internal_annotations["bench_abbrev"] == bench, col]
        if val.values[0] != 0:
            print(col)

In [None]:
for col in knowledge_type_labels:
    if ("comment" not in col) and ("none" not in col):
        val = external_annotations.loc[external_annotations["bench_abbrev"] == bench, col]
        if val.values[0] != 0:
            print(col)

In [None]:
internal_annotations.loc[internal_annotations["bench_abbrev"] == bench, "knowledge_type_other_comment"]

## Data source

In [None]:
internal_annotations.loc[internal_annotations["bench_abbrev"] == bench, "source_concrete"].values

In [None]:
external_annotations.loc[external_annotations["bench_abbrev"] == bench, "source_concrete"].values

## What are the institutions?

In [None]:
# Count how often each institution appears across the internally annotated
# benchmarks; the "institution" field is split on ", ".
all_inst_int = []
for b in internal_annotations["bench_abbrev"].values:
    institutions_of_b = internal_annotations.loc[internal_annotations["bench_abbrev"] == b, "institution"]
    first_entry = institutions_of_b.values[0]
    all_inst_int.extend(first_entry.split(", "))
pd.Series(all_inst_int).value_counts()

In [None]:
# Count how often each institution appears across the externally annotated
# benchmarks; the "institution" field is split on ", ".
all_inst_ext = []
for b in external_annotations["bench_abbrev"].values:
    institutions_of_b = external_annotations.loc[external_annotations["bench_abbrev"] == b, "institution"]
    first_entry = institutions_of_b.values[0]
    all_inst_ext.extend(first_entry.split(", "))
pd.Series(all_inst_ext).value_counts()


## How was the data sourced?

In [None]:
source_how_labels = [col for col in internal_annotations.columns if "source_how" in col]

In [None]:
vc_int_ext_df.loc[(vc_int_ext_df["item"].str.contains("source_how")), :]

In [None]:
internal_annotations.loc[internal_annotations["source_how_web"] == 1, ["bench_abbrev", "source_how_web_comment"]]


In [None]:
external_annotations.loc[external_annotations["source_how_web"] == 1, ["bench_abbrev", "source_how_web_comment"]].values

In [None]:
external_annotations.loc[external_annotations["bench_abbrev"] == "copa", "source_how_dataset_comment"].values


In [None]:
# Horizontal bar chart of the reported data sources (internal annotations),
# most frequent source first.
sns.set_style("white")
data = vc_int_ext_df.loc[
    (vc_int_ext_df["item"].str.contains("source_how")), ["item", "internal_1"]].sort_values("internal_1", ascending=False)
relabel_map = {"human": "human-authored",
               "web": "open access/web data",
               "dataset": "reuse of existing AI/NLP dataset",
               "exam": "exams or textbooks",
               "private": "proprietary/internal source"}
# Strip the column prefix, then swap in the display label where one is defined.
data["item"] = data["item"].map(lambda name: name.replace("source_how_", ""))
data["item"] = data["item"].map(lambda name: relabel_map.get(name, name))
g = sns.catplot(
    data=data, kind="bar",
    x="internal_1", y="item",  width=0.8,
    color=cc.cm.glasbey.colors[10]
)
g.despine(left=True)
g.set_axis_labels("Count", "")
plt.xticks(np.arange(0, 21, 5))
plt.title("Reported data source", fontsize=12)
plt.gcf().set_size_inches(4,2)
# savefig does not create a missing output directory.
os.makedirs(os.path.join(annotations_folder, "images"), exist_ok=True)
plt.savefig(os.path.join(annotations_folder, "images", 'data_collection.pdf'), format='pdf', dpi=300, bbox_inches='tight')

In [None]:
# Horizontal bar chart of the reported data sources (external annotations),
# most frequent source first.
sns.set_style("white")
data = vc_int_ext_df.loc[
    (vc_int_ext_df["item"].str.contains("source_how")), ["item", "external_1"]].sort_values("external_1", ascending=False)
relabel_map = {"human": "human-authored",
               "web": "open access/web data",
               "dataset": "reuse of existing AI/NLP dataset",
               "exam": "exams or textbooks",
               "private": "proprietary/internal source"}
# Strip the column prefix, then swap in the display label where one is defined.
data["item"] = data["item"].map(lambda name: name.replace("source_how_", ""))
data["item"] = data["item"].map(lambda name: relabel_map.get(name, name))
g = sns.catplot(
    data=data, kind="bar",
    x="external_1", y="item", width=0.8,
    color=cc.cm.glasbey.colors[10]
)
g.despine(left=True)
g.set_axis_labels("Count", "")
plt.xticks(np.arange(0, 21, 5))
plt.title("Reported data source\n(external annotation)", fontsize=12)
plt.gcf().set_size_inches(4,2)
# savefig does not create a missing output directory.
os.makedirs(os.path.join(annotations_folder, "images"), exist_ok=True)
plt.savefig(os.path.join(annotations_folder, "images", 'data_collection_ext.pdf'), format='pdf', dpi=300, bbox_inches='tight')

## Topics/domains

In [None]:
internal_annotations.loc[internal_annotations["knowledge_type_other"] == 1, "knowledge_type_other_comment"]

In [None]:
external_annotations.loc[external_annotations["knowledge_type_other"] == 1, "knowledge_type_other_comment"]

## Publication years

In [None]:
benchmarks_overview = benchmarks_overview.rename({"Benchmark": "bench_abbrev"}, axis=1)
benchmarks_overview.columns

In [None]:
internal_plus_year = internal_annotations.merge(benchmarks_overview, on="bench_abbrev", how='left')
internal_plus_year.head()

In [None]:
# Build one (year, domain) row for every benchmark/domain pair marked 1 in
# the internal annotations, sorted descending, with missing values dropped.
knowledge_type_year = []
knowledge_type_labels_no_comment = []
for col in knowledge_type_labels:
    if "comment" in col or "none" in col:
        continue
    is_marked = internal_plus_year[col] == 1
    years = internal_plus_year.loc[is_marked, "Year"].to_list()
    knowledge_type_year.extend(years)
    knowledge_type_labels_no_comment.extend([knowledge_type_labels_map[col]] * len(years))
sorted_lists = sorted(zip(knowledge_type_year, knowledge_type_labels_no_comment), reverse=True)
internal_plus_year_df = pd.DataFrame(sorted_lists, columns=["year", "domain"]).dropna()
#knowledge_type_year, knowledge_type_labels_no_comment = zip(*sorted_lists)
#knowledge_type_year, knowledge_type_labels_no_comment = zip(*sorted_lists)

In [None]:
internal_plus_year.loc[:, ["Year", "bench_abbrev"]].sort_values(by="Year")

In [None]:
internal_plus_year_df.groupby(['year']).value_counts()

In [None]:
internal_plus_year_counted_df = internal_plus_year_df.groupby(['year']).value_counts().reset_index().rename(columns={"index": "year", 0: "count"})
internal_plus_year_counted_df.head()

In [None]:
plt.figure(figsize=((7,5)))
sns.scatterplot(data=internal_plus_year_counted_df, x="year", y="count", hue="domain")
plt.tight_layout()

## Bias & toxicity

In [None]:
print_diff_for_col("bias_toxicity_none")
external_annotations.loc[external_annotations["bench_abbrev"] == "naturalquestions", [col for col in external_annotations.columns if "bias" in col]]

In [None]:
sum(internal_annotations["bias_toxicity_none"])

In [None]:
sum(external_annotations["bias_toxicity_none"])

In [None]:
internal_annotations[internal_annotations["bias_toxicity_toxic"]==1]


In [None]:
external_annotations[external_annotations["bias_toxicity_toxic"]==1]

## Benchmark goal

In [None]:
# goal_labels is first assigned in a later cell of this notebook; define it
# here too so a fresh top-to-bottom run does not raise a NameError.
goal_labels = [col for col in internal_annotations.columns if "goal" in col and "comment" not in col]
# Number of benchmarks marked 1 for each goal item (internal annotations).
for col in goal_labels:
    print(col, sum(internal_annotations[col]))

In [None]:
goal_labels = [col for col in internal_annotations.columns if "goal" in col and not "comment" in col]

In [None]:
for col in goal_labels:
    print(col, sum(external_annotations[col]))

In [None]:
# Pie chart of the reported benchmark motivations (internal annotations);
# only goals with at least one "yes" are shown.
goals_color_map = map_list_to_color(goal_labels)
data = vc_int_ext_df.loc[
    (vc_int_ext_df["item"].str.contains("goal")) & (vc_int_ext_df["internal_1"] > 0), ["item", "internal_1"]]
data = data.sort_values(by="internal_1")
plt.figure(figsize=(6, 6))
sns.set_style("whitegrid")
relabel_map = {"new_task": "to define a new task", "realistic": "more realistic questions compared to existing benchmarks", "difficulty": "increased difficulty compared to existing benchmarks"}
x = data.internal_1
colors = [goals_color_map[item] for item in data.item]
labels = [label.replace("goal_", "") for label in data.item]
labels = [relabel_map.get(label, label) for label in labels]
# matplotlib passes autopct each wedge's share of sum(x) in percent, so the
# absolute count has to be recovered with sum(x) rather than a fixed 30.
patches, texts, autotexts = plt.pie(x, labels=labels, autopct=(lambda p, total=x.sum(): f"{p * total / 100:.0f}"), colors=colors)
for txt in texts:
    txt.set_fontsize(16)
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontsize(16)
# Add a title
plt.title("Reported motivation", fontsize=18)
# Save the plot; savefig does not create a missing output directory.
# NOTE(review): a later bar-chart cell writes to the same 'motivation.pdf'
# and overwrites this figure — confirm which version is wanted.
os.makedirs(os.path.join(annotations_folder, "images"), exist_ok=True)
plt.savefig(os.path.join(annotations_folder, "images", 'motivation.pdf'), format='pdf', dpi=300,
            bbox_inches='tight')

In [None]:
int_sorted = vc_int_ext_df.loc[vc_int_ext_df["item"].str.contains("goal"), ["item", "internal_1"]].sort_values("internal_1", ascending=False)
int_sorted

In [None]:
ext_sorted = vc_int_ext_df.loc[vc_int_ext_df["item"].str.contains("goal"), ["item", "external_1"]].sort_values("external_1", ascending=False)
ext_sorted

In [None]:
internal_annotations.loc[internal_annotations["goal_other"] > 0, ["bench_abbrev", "goal_other_comment"]]

In [None]:
external_annotations.loc[external_annotations["goal_represent"] > 0, ["bench_abbrev", "goal_represent_comment"]].values[0]


In [None]:
# Horizontal bar chart of the reported benchmark motivations
# (internal annotations), most frequent goal first.
sns.set_style("white")
data = vc_int_ext_df.loc[
    (vc_int_ext_df["item"].str.contains("goal")), ["item", "internal_1"]].sort_values("internal_1", ascending=False)
relabel_map = {"new_task": "to define a new task",
               "realistic": "more realistic questions",
               "difficulty": "increased difficulty",
               "less_difficulty": "decreased difficulty",
               "represent": "better social representativeness"}
# Strip the column prefix, then swap in the display label where one is defined.
data["item"] = data["item"].map(lambda name: name.replace("goal_", ""))
data["item"] = data["item"].map(lambda name: relabel_map.get(name, name))
g = sns.catplot(
    data=data, kind="bar",
    x="internal_1", y="item", width=0.8,
    color=cc.cm.glasbey.colors[10]
)
g.despine(left=True)
g.set_axis_labels("Count", "")
plt.xticks(np.arange(0, 21, 5))
plt.title("Reported motivation", fontsize=12)
plt.gcf().set_size_inches(4,1.7)
# savefig does not create a missing output directory.
os.makedirs(os.path.join(annotations_folder, "images"), exist_ok=True)
plt.savefig(os.path.join(annotations_folder, "images", 'motivation.pdf'), format='pdf', dpi=300, bbox_inches='tight')

In [None]:
# Horizontal bar chart of the reported benchmark motivations
# (external annotations), most frequent goal first.
sns.set_style("white")
data = vc_int_ext_df.loc[
    (vc_int_ext_df["item"].str.contains("goal")), ["item", "external_1"]].sort_values("external_1", ascending=False)
relabel_map = {"new_task": "to define a new task",
               "realistic": "more realistic questions",
               "difficulty": "increased difficulty",
               "less_difficulty": "decreased difficulty",
               "represent": "better social representativeness"}
# Strip the column prefix, then swap in the display label where one is defined.
data["item"] = data["item"].map(lambda name: name.replace("goal_", ""))
data["item"] = data["item"].map(lambda name: relabel_map.get(name, name))
g = sns.catplot(
    data=data, kind="bar",
    x="external_1", y="item", width=0.8,
    color=cc.cm.glasbey.colors[10]
)
g.despine(left=True)
g.set_axis_labels("Count", "")
plt.xticks(np.arange(0, 21, 5))

plt.title("Reported motivation\n(external annotation)", fontsize=12)
plt.gcf().set_size_inches(4,1.7)
# savefig does not create a missing output directory.
os.makedirs(os.path.join(annotations_folder, "images"), exist_ok=True)
plt.savefig(os.path.join(annotations_folder, "images", 'motivation_ext.pdf'), format='pdf', dpi=300, bbox_inches='tight')

In [None]:
# NOTE(review): this rebinds relabel_map to the data-source labels already
# used by the "How was the data sourced?" plots; no later cell visible here
# reads it — presumably a leftover, confirm before removing.
relabel_map = {"human": "human-authored",
               "web": "open access/web data",
               "dataset": "reuse of existing AI/NLP dataset",
               "exam": "exams or textbooks",
               "private": "proprietary/internal source"}

## Transparent benchmarks

In [None]:
none_cols = [col for col in internal_annotations.columns if "none" in col]


In [None]:
none_data_no_comments_cols = [col for col in none_cols if "comment" not in col]
len(none_data_no_comments_cols)

In [None]:
benchmarks = internal_annotations["bench_abbrev"].values
benchmarks

In [None]:
for b in benchmarks:
    sum_none = internal_annotations.loc[internal_annotations["bench_abbrev"] == b, none_data_no_comments_cols].sum(axis=1)
    print(b, sum_none.values)

In [None]:
for b in benchmarks:
    sum_none = external_annotations.loc[external_annotations["bench_abbrev"] == b, none_data_no_comments_cols].sum(axis=1)
    print(b, sum_none.values)

In [None]:
internal_annotations.loc[internal_annotations["identity_demographic_none"] == 0, "bench_abbrev"].values

In [None]:
internal_annotations.loc[internal_annotations["recruitment_criteria_none"] == 0, "bench_abbrev"].values


In [None]:
internal_annotations.loc[internal_annotations["bias_toxicity_none"] == 0, "bench_abbrev"].values
