In [97]:
import pandas as pd
from collections import Counter

from util.file_helper import read_all_csv_files
from util.data_structure_helper import dict_to_df
from util.annotator_helper import (
    form_label_dataframes,
    get_agreement,
    fix_multiple_labels,
    select_label_columns)


In [98]:
# Metric name handed to get_agreement() by the cells further down.
selected_metric = "weighted_kappa"
# Metric names kept alongside it; none of the cells shown here reads this list.
metrics = ['kappa', 'alpha', 'avg_Ao', 'multi_kappa', 'S', 'pi']

# Read the raw labelling CSVs. Later cells index the result by annotator name
# (e.g. labellings["annotator1"]), so it behaves like a mapping of frames.
labellings = read_all_csv_files(
    'data/acl/sections/labelled/manuel/text_sim_header/raw/')

# Going by the helper names: keep only the label columns of each labelling,
# then merge the per-annotator mapping into one frame -- TODO confirm.
dfs_dict = select_label_columns(labellings)
annotators = sorted(dfs_dict.keys())
dfs = dict_to_df(dfs_dict)


In [99]:
# Agreement on the frames before fix_multiple_labels() is applied.
multilabel_data = form_label_dataframes(dfs)
multilabel_agreement = round(get_agreement(multilabel_data, selected_metric), 3)
print(f"Agreement with multilabels: {multilabel_agreement}")


# fix_multiple_labels() returns four values: the fixed frame, the number of
# records without an intersection, the intersection counts shown in the next
# cell, and the number of records that carried multiple labels.
fix_result = fix_multiple_labels(dfs)
label_fixed_df, not_intersected_count, intersection_counts, number_of_multi_labels = fix_result

# Agreement again, this time on the fixed frame.
data = form_label_dataframes(label_fixed_df)
single_label_agreement = round(get_agreement(data, selected_metric), 3)
print(f"Agreement without multilabels: {single_label_agreement}")

print(f"Number of records with multiple labels: {number_of_multi_labels}")
print(f"Number of records with no intersection: {not_intersected_count}")
intersected_count = number_of_multi_labels - not_intersected_count
print(f"Number of records with intersection: {intersected_count}")


Agreement with multilabels: 0.45
Agreement without multilabels: 0.587
Number of records with multiple labels: 159
Number of records with no intersection: 96
Number of records with intersection: 63


In [100]:
# Third value returned by fix_multiple_labels(dfs) in the previous cell,
# displayed as this cell's output. Presumably a per-label count of the
# multi-labelled records that had an intersection -- TODO confirm in the helper.
intersection_counts


{'requirements': 13,
 'evaluation': 25,
 'training': 14,
 'pre-trained models': 6,
 'results': 2,
 'introduction': 3}

In [101]:
print("Single-labelled Human Agreement Range Statistics")
# Values passed as the second argument of select_label_columns(), one per pass.
score_ranges = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

for score_range in score_ranges:
    # Same pipeline as the full-data agreement cell, restricted by score_range.
    frames = dict_to_df(select_label_columns(labellings, score_range))
    ranged_label_fixed, _, _, _ = fix_multiple_labels(frames)
    # NOTE: this rebinds `data` from the earlier cell; after the loop it holds
    # the frames of the last score range only.
    data = form_label_dataframes(ranged_label_fixed)
    agreement = round(get_agreement(data, selected_metric), 3)
    record_count = len(ranged_label_fixed)
    print(f"Score range: {score_range} - Agreement: {agreement} - Number of records: {record_count}")


Single-labelled Human Agreement Range Statistics
Score range: 0 - Agreement: 0.587 - Number of records: 1050
Score range: 0.1 - Agreement: 0.588 - Number of records: 1045
Score range: 0.2 - Agreement: 0.591 - Number of records: 1000
Score range: 0.3 - Agreement: 0.602 - Number of records: 835
Score range: 0.4 - Agreement: 0.603 - Number of records: 603
Score range: 0.5 - Agreement: 0.647 - Number of records: 342
Score range: 0.6 - Agreement: 0.702 - Number of records: 128
Score range: 0.7 - Agreement: 0.745 - Number of records: 59
Score range: 0.8 - Agreement: 0.813 - Number of records: 21
Score range: 0.9 - Agreement: 1.0 - Number of records: 2


In [102]:
# A row counts as agreed when every column equals the value in the first column.
first_column = label_fixed_df.iloc[:, 0]
rows_in_full_agreement = label_fixed_df.eq(first_column, axis=0).all(axis=1)
agreed_labels = label_fixed_df[rows_in_full_agreement]

# Everything that is not fully agreed is counted as disagreed.
agreed_label_count = len(agreed_labels)
disagreed_label_count = len(label_fixed_df) - agreed_label_count
print(f"Agreed label count: {agreed_label_count}")
print(f"Disagreed label count: {disagreed_label_count}")


Agreed label count: 580
Disagreed label count: 470


In [103]:
# Per-label counts over the fully agreed rows. Every column of agreed_labels
# equals the first column of label_fixed_df on these rows, so reading the
# annotator1 column alone yields the agreed label.
agreed_labels["annotator1"].value_counts().to_dict()


{'requirements': 181,
 'irrelevant': 111,
 'evaluation': 95,
 'training': 85,
 'introduction': 75,
 'pre-trained models': 22,
 'results': 11}

In [104]:
def _partially_agreed_label(labels):
    """Return the most common value in ``labels`` if it occurs exactly twice, else '-'."""
    common_label, count = Counter(labels).most_common(1)[0]
    return common_label if count == 2 else "-"


# Rows that are not in full agreement.
disagreed_df = label_fixed_df.drop(agreed_labels.index)
disagreed_df_cp = disagreed_df.copy()

# Build the whole column in one assignment instead of writing it cell by cell
# inside an iterrows() loop. This also creates the column when there are no
# disagreed rows, so the counts below never fail with a KeyError.
disagreed_df_cp["partially_agreed_lbl"] = [
    _partially_agreed_label(row_labels) for row_labels in disagreed_df.to_numpy()
]

# "-" marks rows where no value occurs exactly twice.
has_partial_agreement = disagreed_df_cp["partially_agreed_lbl"] != "-"

print(f"Disagreed label count: {len(disagreed_df_cp)}")
print(
    f"Partially agreed label count: {int(has_partial_agreement.sum())}")
print(
    f"Partially disagreed label count: {int((~has_partial_agreement).sum())}")


Disagreed label count: 470
Partially agreed label count: 355
Partially disagreed label count: 115


In [105]:
# Disagreed rows for which a value occurring exactly twice was recorded.
partially_agreed_labels = disagreed_df_cp[disagreed_df_cp['partially_agreed_lbl'] != '-']

# The original cell also called agreed_labels.assign(...) and discarded the
# result; that statement had no effect and has been removed.

# NOTE: this adds a "human" column to label_fixed_df in place. Re-running the
# cell that compares every column of label_fixed_df against its first column
# would then include "human" in that comparison.
label_fixed_df["human"] = ""
# Fully agreed rows take the shared label (read from the annotator1 column).
label_fixed_df.loc[agreed_labels.index, "human"] = agreed_labels["annotator1"]
# Partially agreed rows take the label recorded in partially_agreed_lbl.
label_fixed_df.loc[partially_agreed_labels.index,
                   "human"] = partially_agreed_labels["partially_agreed_lbl"]

# Keep only rows that received a label. The explicit copy makes partial_labels
# an independent frame, so adding "text" neither touches label_fixed_df nor
# triggers a SettingWithCopyWarning.
partial_labels = label_fixed_df.loc[label_fixed_df["human"] != "", ["human"]].copy()
partial_labels["text"] = labellings["annotator1"].loc[partial_labels.index].text
print(f"Lenght of partial labels: {len(partial_labels)}")


Lenght of partial labels: 935


In [106]:
# Split the labelled rows on whether the human label is "irrelevant".
is_irrelevant = partial_labels.human == "irrelevant"
irrelevants = partial_labels[is_irrelevant]

# The complement is selected by index difference against the irrelevant rows.
remaining_index = partial_labels.index.difference(irrelevants.index)
not_irrelevants = partial_labels.loc[remaining_index]

print(f"Number of irrelevants: {len(irrelevants)}")
print(f"Number of not irrelevants: {len(not_irrelevants)}")


Number of irrelevants: 204
Number of not irrelevants: 731


In [107]:
# Index finding

# Sort by text and rename the "pre-trained models" label. Both steps build a
# new frame: the original called replace(inplace=True) on the `human` column
# accessor, a chained in-place update that pandas does not guarantee to write
# back to the frame (and that has no effect under copy-on-write).
not_irrelevants = not_irrelevants.sort_values(by=["text"]).assign(
    human=lambda frame: frame["human"].replace(
        to_replace='pre-trained models', value="pretrained_model"))

# Rebuild the text of every section as parent header, header and content
# joined by newlines, without the leading newline left by an empty parent.
df = pd.read_csv("data/acl/sections/sections_header_clean.csv")
df['parent_header'] = df['parent_header'].fillna("")
df["text"] = df[["parent_header", "header", "content"]].agg("\n".join, axis=1)
df["text"] = df["text"].str.removeprefix("\n")

# Sections whose text occurs among the labelled rows, sorted by text like
# not_irrelevants, so the two can be paired by position.
matched_sections = df[df.text.isin(not_irrelevants.text)].sort_values(by="text")
indexes = matched_sections.index

# The pairing below is positional, so fail loudly if the two sorted text
# columns do not line up instead of storing wrong section indexes.
texts_line_up = len(indexes) == len(not_irrelevants) and bool(
    (matched_sections["text"].to_numpy() == not_irrelevants["text"].to_numpy()).all())
if not texts_line_up:
    raise ValueError(
        "Labelled texts do not map one-to-one onto sections_header_clean.csv; "
        "cannot assign data_index by position.")
not_irrelevants["data_index"] = indexes


In [108]:
df_clean = pd.read_csv("data/acl/sections/sections_clean.csv", index_col=0)

# Re-key the labelled rows by data_index BEFORE filtering. The original dropped
# `indexes.difference(df_clean.index)` while the frame was still keyed by the
# labelling index, so section labels were matched against the wrong index:
# unrelated rows could be removed, or drop() could raise KeyError.
not_irrelevants = not_irrelevants.set_index("data_index")

# Keep only rows whose section index is present in df_clean.
# Assumes the index column of sections_clean.csv uses the same labels as the
# row index of sections_header_clean.csv -- TODO confirm.
not_irrelevants = not_irrelevants[not_irrelevants.index.isin(df_clean.index)]
# not_irrelevants.to_csv(
#     "data/acl/sections/labelled/manuel/text_sim_header/processed/partially_agreed_non_irrelevants.csv")
