In [1]:
import pandas as pd
from util.data_structure_helper import get_intersection, min_intersected
from itertools import chain
from collections import Counter
from util.annotator_helper import form_label_dataframes, get_agreement


In [2]:
annotator2 = pd.read_csv(
    "data/acl/sections/labelled/manuel/sections_grouped_orderby-stars/raw/annotator2.csv")
annotator3 = pd.read_csv(
    "data/acl/sections/labelled/manuel/sections_grouped_orderby-stars/raw/annotator3.csv")
annotator1 = pd.read_csv(
    "data/acl/sections/labelled/manuel/sections_grouped_orderby-stars/raw/annotator1.csv")


In [3]:
annotator2_repo = annotator2.groupby("repo").agg({"label": "unique"}).reset_index()
annotator2_repo.label = annotator2_repo.label.apply(
    lambda x: [item for sublist in x for item in sublist.split("#")])
annotator2_repo.label = annotator2_repo.label.apply(
    lambda x: [i for i in x if i != "irrelevant"])


annotator3_repo = annotator3.groupby("repo").agg({"label": "unique"}).reset_index()
annotator3_repo.label = annotator3_repo.label.apply(
    lambda x: [item for sublist in x for item in sublist.split("#")])
annotator3_repo.label = annotator3_repo.label.apply(
    lambda x: [i for i in x if i != "irrelevant"])

annotator1_repo = annotator1.groupby("repo").agg({"label": "unique"}).reset_index()
annotator1_repo.label = annotator1_repo.label.apply(
    lambda x: [item for sublist in x for item in sublist.split("#")])
annotator1_repo.label = annotator1_repo.label.apply(
    lambda x: [i for i in x if i != "irrelevant"])


In [4]:
merged_repos = pd.merge(annotator2_repo, annotator3_repo, on="repo", how="inner").merge(
    annotator1_repo, on="repo", how="inner")
# merged_repos = pd.merge(annotator2_repo, annotator3_repo, on="repo", how="outer").merge(
#     annotator1_repo, on="repo", how="outer")
merged_repos.rename(columns={"label_x": "label_annotator2",
                             "label_y": "label_annotator3", "label": "label_annotator1"}, inplace=True)
merged_repos = merged_repos.fillna("").apply(list)


In [5]:
merged_repos["intersection"] = merged_repos.apply(
    lambda x: get_intersection([x["label_annotator2"], x["label_annotator3"], x["label_annotator1"]]), axis=1)
merged_repos["min_intersection"] = merged_repos.apply(
    lambda x: min_intersected([x["label_annotator2"], x["label_annotator3"], x["label_annotator1"]], 2), axis=1)
merged_repos.head()


Unnamed: 0,repo,label_annotator2,label_annotator3,label_annotator1,intersection,min_intersection
0,Adapter-Hub/adapter-transformers,[requirements],"[requirements, introduction]","[requirements, evaluation, pretrained_model, r...",[requirements],[requirements]
1,ChenRocks/fast_abs_rl,"[requirements, evaluation, pretrained_model, t...","[introduction, requirements, evaluation, requi...","[introduction, requirements, evaluation, requi...","[training, pretrained_model, requirements, eva...","[introduction, requirements, training, evaluat..."
2,EagleW/PaperRobot,[],[introduction],[introduction],[],[introduction]
3,Embedding/Chinese-Word-Vectors,[evaluation],"[introduction, requirements, evaluation]","[introduction, pretrained_model, results, eval...",[evaluation],"[introduction, evaluation]"
4,EmilyAlsentzer/clinicalBERT,[pretrained_model],"[introduction, requirements, pretrained_model]","[introduction, pretrained_model]",[pretrained_model],"[introduction, pretrained_model]"


In [6]:
comparison_df = merged_repos.copy()
comparison_df["label_annotator2_str"] = comparison_df["label_annotator2"].apply(
    lambda x: "".join(sorted(x)))
comparison_df["label_annotator3_str"] = comparison_df["label_annotator3"].apply(
    lambda x: "".join(sorted(x)))
comparison_df["label_annotator1_str"] = comparison_df["label_annotator1"].apply(
    lambda x: "".join(sorted(x)))


In [7]:
labels = comparison_df[['label_annotator2_str',
                        'label_annotator3_str', 'label_annotator1_str']]
formed_labels = form_label_dataframes(labels)
get_agreement(formed_labels, "weighted_kappa")


0.10608479572078666

In [8]:
print("Number of repos with min intersection equal to intersection: ",
      merged_repos[merged_repos.intersection == merged_repos.min_intersection].shape[0])
print("Number of repos with min intersection not equal to intersection: ",
      merged_repos[merged_repos.intersection != merged_repos.min_intersection].shape[0])


Number of repos with min intersection equal to intersection:  50
Number of repos with min intersection not equal to intersection:  157


In [9]:
merged_repos["union"] = merged_repos[["label_annotator2", "label_annotator3",
                                      "label_annotator1"]].apply(lambda x: list(set(chain.from_iterable(x))), axis=1)


In [10]:
Counter(list(chain.from_iterable(merged_repos.min_intersection.values.tolist())))


Counter({'requirements': 141,
         'introduction': 146,
         'training': 53,
         'evaluation': 68,
         'pretrained_model': 59,
         'results': 28})

In [11]:
Counter(list(chain.from_iterable(merged_repos.intersection.values.tolist())))


Counter({'requirements': 102,
         'training': 31,
         'pretrained_model': 27,
         'evaluation': 32,
         'introduction': 58,
         'results': 11})

In [12]:
Counter(list(chain.from_iterable(merged_repos.union.values.tolist())))


Counter({'introduction': 179,
         'requirements': 183,
         'training': 95,
         'evaluation': 136,
         'pretrained_model': 103,
         'results': 60})

In [13]:
# merged_repos[["repo", "intersection"]].to_json(
#     "data/acl/sections/labelled/manuel/sections_grouped_orderby-stars/processed/intersection.json", orient="records")
# merged_repos[["repo", "min_intersection"]].to_json(
#     "data/acl/sections/labelled/manuel/sections_grouped_orderby-stars/processed/min_intersection.json", orient="records")
# merged_repos[["repo", "union"]].to_json(
#     "data/acl/sections/labelled/manuel/sections_grouped_orderby-stars/processed/union.json", orient="records")
