In [36]:
import glob
import os

import pandas as pd
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

from e2e_system.reproder import _ground_truth
from util.annotator_helper import form_label_dataframes, get_agreement
from util.file_helper import read_all_csv_files


In [47]:
def get_agrmnt(df, metric):
    """Return the agreement between summed true labels and summed predictions.

    The ground-truth columns (the names obtained by iterating
    ``_ground_truth``) and every column whose name contains ``"_pred"`` are
    each summed row-wise into a per-row score. The two score vectors are then
    compared with the requested agreement metric.

    The scores are assembled in a separate frame, so ``df`` is left
    unmodified and can be safely reused by other helpers.

    Args:
        df: Frame holding the ground-truth columns and the ``*_pred*`` columns.
        metric: ``"quadratic"`` selects ``cohen_kappa_score`` with quadratic
            weights; any other value is forwarded unchanged to
            ``get_agreement``.

    Returns:
        The agreement value rounded to three decimals.
    """
    true_columns = list(_ground_truth)
    pred_columns = [column for column in df.columns if "_pred" in column]

    # Build the scores in a private frame instead of writing new columns into
    # the caller's DataFrame.
    scores = pd.DataFrame(
        {
            "true_score": df[true_columns].sum(axis=1),
            "pred_score": df[pred_columns].sum(axis=1),
        }
    )

    if metric == "quadratic":
        agreement = cohen_kappa_score(
            scores["true_score"], scores["pred_score"], weights="quadratic")
    else:
        agreement = get_agreement(form_label_dataframes(scores), metric)
    return round(agreement, 3)


def get_correlation(df, round_step=3):
    """Correlate the ``true_reprod_score`` and ``reprod_score`` columns.

    Args:
        df: Frame containing both score columns.
        round_step: Number of decimals kept in the result.

    Returns:
        The correlation coefficient (pandas' default method) rounded to
        ``round_step`` decimals.
    """
    expected = df["true_reprod_score"]
    predicted = df["reprod_score"]
    return round(expected.corr(predicted), round_step)


def analyze_reprod_score(df):
    """Return mean(``true_reprod_score``) minus mean(``reprod_score``).

    Args:
        df: Frame containing both score columns.

    Returns:
        The difference of the two column means, rounded to three decimals.
    """
    true_mean = df["true_reprod_score"].mean()
    predicted_mean = df["reprod_score"].mean()
    return round(true_mean - predicted_mean, 3)


def get_evaluation_metric_results(df):
    """Score the boolean predictions in ``df`` against the boolean labels.

    Only ``bool`` columns are considered. Columns ending in ``"_pred"`` are
    treated as predictions and all remaining boolean columns as ground truth.
    Both groups are flattened row by row into one label vector each before
    scoring, so the two groups are compared position by position.

    Args:
        df: Frame holding boolean label and ``*_pred`` columns.

    Returns:
        ``(accuracy, precision, recall, f1)``, each rounded to three decimals.
    """
    bool_frame = df.select_dtypes(include=["bool"])

    # Partition the boolean columns while keeping their original order.
    predicted_names, actual_names = [], []
    for name in bool_frame.columns:
        if name.endswith("_pred"):
            predicted_names.append(name)
        else:
            actual_names.append(name)

    y_pred = bool_frame[predicted_names].values.flatten()
    y_true = bool_frame[actual_names].values.flatten()

    accuracy = accuracy_score(y_true, y_pred)
    # The fourth element returned by sklearn (support) is not reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return tuple(round(score, 3) for score in (accuracy, *prf[:3]))


def get_labelling_score(method, selection):
    """Return the mean of the ``score`` column of one labelled-sections CSV.

    Args:
        method: Sub-directory name under ``data/acl/sections/labelled``.
        selection: CSV file name (without extension) inside that directory.

    Returns:
        The mean score rounded to three decimals.
    """
    labelled = pd.read_csv(
        f"data/acl/sections/labelled/{method}/{selection}.csv", index_col=0)
    mean_score = labelled["score"].mean()
    return round(mean_score, 3)


In [39]:
# Collect every entry directly under the NeurIPS evaluation directory.
# glob is called without recursive=True, so the trailing "**" matches a
# single path level, exactly like "*".
evaluations = glob.glob(
    pathname="data/paperswithcode/evaluated/classification/bert_base_uncased/neurips/**",
)


In [56]:
# Build one result row per (labelling method, evaluation type) pair.

# Result column -> metric name understood by get_agrmnt.
agreement_metrics = {
    "kappa": "kappa",
    "weighted_linear": "weighted_kappa",
    "weighted_quadratic": "quadratic",
    "fleiss": "multi_kappa",
    "scott": "pi",
    "bennett": "S",
    "alpha": "alpha",
    "avg_Ao": "avg_Ao",
}

res = []
for evaluation in evaluations:
    results = read_all_csv_files(f"{evaluation}/")
    # glob returns OS-native paths. normpath drops any trailing separator so
    # basename yields the last path component on every platform (splitting on
    # "\\" only worked on Windows). Kept in its own name so the loop variable
    # is not overwritten.
    labelling_method = os.path.basename(os.path.normpath(evaluation))
    for key, frame in results.items():
        record = {"correlation": get_correlation(frame)}
        record.update(
            (column, get_agrmnt(frame, metric))
            for column, metric in agreement_metrics.items()
        )
        # mean_diff (analyze_reprod_score) and accuracy/precision/recall/f1
        # (get_evaluation_metric_results) are not part of the reported table,
        # so they are no longer computed here.
        record["labelling_method"] = labelling_method
        record["type"] = key.replace("neurips_", "")
        res.append(record)


In [50]:
# Split "<labelling method>_<method>" names: the last "_"-separated token
# becomes `method`, the remaining tokens (joined by spaces) replace
# `labelling_method`. assign() evaluates the keywords in order, so `method`
# is derived from the original names before they are overwritten.
analysis = pd.DataFrame(res).assign(
    method=lambda frame: frame["labelling_method"].str.split("_").str[-1],
    labelling_method=lambda frame: frame["labelling_method"].map(
        lambda name: " ".join(name.split("_")[:-1])),
)


In [57]:
# Results indexed by (labelling_method, type); the best value of every metric
# column is highlighted in green and all values are shown with three decimals.
(
    pd.DataFrame(res)
    .set_index(["labelling_method", "type"])
    .round(3)
    .style
    .highlight_max(color="green", axis=0)
    .format("{:.3f}")
)


Unnamed: 0_level_0,Unnamed: 1_level_0,correlation,kappa,weighted_linear,weighted_quadratic,fleiss,scott,bennett,alpha,avg_Ao
labelling_method,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
content_textsim,base,0.549,0.177,0.177,0.521,0.177,0.174,0.232,0.178,0.36
content_textsim,consecutive,0.554,0.177,0.177,0.521,0.177,0.174,0.232,0.178,0.36
content_textsim,grouped,0.376,0.094,0.094,0.337,0.094,0.087,0.148,0.092,0.29
content_textsim,grouped_consecutive,0.37,0.094,0.094,0.337,0.094,0.087,0.148,0.092,0.29
content_zeroshot,base,0.582,0.155,0.155,0.563,0.155,0.152,0.208,0.156,0.34
content_zeroshot,consecutive,0.586,0.155,0.155,0.563,0.155,0.152,0.208,0.156,0.34
content_zeroshot,grouped,0.431,0.077,0.077,0.371,0.077,0.064,0.124,0.068,0.27
content_zeroshot,grouped_consecutive,0.434,0.077,0.077,0.371,0.077,0.064,0.124,0.068,0.27
grouped_textsim,base,0.579,0.145,0.145,0.542,0.145,0.137,0.196,0.141,0.33
grouped_textsim,consecutive,0.581,0.145,0.145,0.542,0.145,0.137,0.196,0.141,0.33


In [64]:
# Same table as above, ordered from highest to lowest correlation; the best
# value of every metric column is highlighted in green.
(
    pd.DataFrame(res)
    .set_index(["labelling_method", "type"])
    .sort_values(by=["correlation"], ascending=False)
    .round(3)
    .style
    .highlight_max(color="green", axis=0)
    .format("{:.3f}")
)


Unnamed: 0_level_0,Unnamed: 1_level_0,correlation,kappa,weighted_linear,weighted_quadratic,fleiss,scott,bennett,alpha,avg_Ao
labelling_method,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
grouped_zeroshot,base,0.651,0.322,0.322,0.648,0.322,0.318,0.364,0.321,0.47
grouped_zeroshot,consecutive,0.661,0.322,0.322,0.648,0.322,0.318,0.364,0.321,0.47
header_plus_content_zeroshot,consecutive,0.624,0.204,0.204,0.556,0.204,0.198,0.244,0.202,0.37
header_plus_content_zeroshot,base,0.617,0.204,0.204,0.556,0.204,0.198,0.244,0.202,0.37
header_content_textsim,consecutive,0.571,0.193,0.193,0.523,0.193,0.188,0.232,0.192,0.36
header_content_textsim,base,0.578,0.193,0.193,0.523,0.193,0.188,0.232,0.192,0.36
content_textsim,base,0.549,0.177,0.177,0.521,0.177,0.174,0.232,0.178,0.36
content_textsim,consecutive,0.554,0.177,0.177,0.521,0.177,0.174,0.232,0.178,0.36
header_plus_content_textsim,grouped,0.417,0.162,0.162,0.374,0.162,0.158,0.208,0.163,0.34
header_plus_content_textsim,grouped_consecutive,0.414,0.162,0.162,0.374,0.162,0.158,0.208,0.163,0.34


In [65]:
# Average every numeric metric for the "grouped" and "base" evaluation types.
(
    analysis
    .loc[analysis["type"].isin(["grouped", "base"])]
    .groupby("type")
    .mean(numeric_only=True)
    .round(3)
    .reset_index()
)


Unnamed: 0,type,correlation,kappa,fleiss,scott,bennett,alpha,weighted_kappa,quadratic,accuracy
0,base,0.571,0.159,0.159,0.154,0.21,0.158,0.159,0.547,0.663
1,grouped,0.396,0.102,0.102,0.095,0.15,0.099,0.102,0.366,0.611


In [66]:
# Display the full analysis table.
# Scratch values previously left as bare expressions in this cell: 0.312, 0.30.
# As the cell's last expressions they replaced the table as the cell output,
# so they are kept here as a note and `analysis` is now the final expression.
analysis

Unnamed: 0,correlation,kappa,fleiss,scott,bennett,alpha,weighted_kappa,quadratic,accuracy,labelling_method,type,method
0,0.549,0.177,0.177,0.174,0.232,0.178,0.177,0.521,0.665,content,base,textsim
1,0.554,0.177,0.177,0.174,0.232,0.178,0.177,0.521,0.665,content,consecutive,textsim
2,0.376,0.094,0.094,0.087,0.148,0.092,0.094,0.337,0.602,content,grouped,textsim
3,0.37,0.094,0.094,0.087,0.148,0.092,0.094,0.337,0.602,content,grouped_consecutive,textsim
4,0.582,0.155,0.155,0.152,0.208,0.156,0.155,0.563,0.662,content,base,zeroshot
5,0.586,0.155,0.155,0.152,0.208,0.156,0.155,0.563,0.662,content,consecutive,zeroshot
6,0.431,0.077,0.077,0.064,0.124,0.068,0.077,0.371,0.61,content,grouped,zeroshot
7,0.434,0.077,0.077,0.064,0.124,0.068,0.077,0.371,0.61,content,grouped_consecutive,zeroshot
8,0.579,0.145,0.145,0.137,0.196,0.141,0.145,0.542,0.697,grouped,base,textsim
9,0.581,0.145,0.145,0.137,0.196,0.141,0.145,0.542,0.697,grouped,consecutive,textsim


In [73]:
# Average every numeric metric per (method, labelling_method) pair.
(
    analysis
    .groupby(["method", "labelling_method"])
    .mean(numeric_only=True)
    .round(3)
)


Unnamed: 0_level_0,Unnamed: 1_level_0,correlation,kappa,fleiss,scott,bennett,alpha,weighted_kappa,quadratic,accuracy
method,labelling_method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
textsim,content,0.462,0.136,0.136,0.13,0.19,0.135,0.136,0.429,0.634
textsim,grouped,0.495,0.132,0.132,0.125,0.184,0.13,0.132,0.462,0.656
textsim,header,0.436,0.076,0.076,0.066,0.13,0.07,0.076,0.411,0.624
textsim,header content,0.484,0.17,0.17,0.164,0.214,0.168,0.17,0.422,0.646
textsim,header plus,0.501,0.088,0.088,0.081,0.142,0.086,0.088,0.485,0.638
textsim,header plus content,0.492,0.154,0.154,0.149,0.202,0.154,0.154,0.451,0.661
zeroshot,content,0.508,0.116,0.116,0.108,0.166,0.112,0.116,0.467,0.636
zeroshot,grouped,0.52,0.238,0.238,0.232,0.28,0.236,0.238,0.524,0.672
zeroshot,header,0.373,0.122,0.122,0.118,0.172,0.123,0.122,0.391,0.575
zeroshot,header content,0.511,0.119,0.119,0.113,0.166,0.118,0.119,0.528,0.644
