In [1]:
import glob
import os

import pandas as pd
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

from e2e_system.reproder import _ground_truth
from util.file_helper import read_all_csv_files


In [2]:
def get_kappa(df):
    """Return the quadratic-weighted Cohen's kappa of summed true vs. predicted labels.

    Two per-row scores are built by summing label columns:

    * the reference score sums the columns named by iterating ``_ground_truth``;
    * the predicted score sums every column whose name contains ``"_pred"``.

    Both scores are kept as local Series, so ``df`` is not modified (no helper
    columns are written back into the caller's frame).

    Args:
        df: DataFrame containing the ``_ground_truth`` columns and the
            prediction columns (presumably ``<label>_pred`` -- confirm
            against the evaluation CSV schema).

    Returns:
        The kappa statistic rounded to three decimals.
    """
    true_columns = list(_ground_truth)
    pred_columns = [c for c in df.columns if "_pred" in c]

    # Row-wise sums; computed locally instead of being stored on ``df``.
    true_score = df[true_columns].sum(axis=1)
    pred_score = df[pred_columns].sum(axis=1)

    return round(cohen_kappa_score(true_score, pred_score, weights="quadratic"), 3)


def get_correlation(df, round_step=3):
    """Correlate the ``true_reprod_score`` column with the ``reprod_score`` column.

    Args:
        df: DataFrame with ``true_reprod_score`` and ``reprod_score`` columns.
        round_step: Number of decimals kept in the result.

    Returns:
        The correlation coefficient (``Series.corr`` default method), rounded
        to ``round_step`` decimals.
    """
    reference = df.true_reprod_score
    estimate = df.reprod_score
    return round(reference.corr(other=estimate), round_step)


def analyze_reprod_score(df):
    """Return mean(``true_reprod_score``) minus mean(``reprod_score``), rounded to 3 decimals.

    A positive value means the ``reprod_score`` column is lower on average
    than the ``true_reprod_score`` column.
    """
    reference_mean = df.true_reprod_score.mean()
    estimate_mean = df.reprod_score.mean()
    return round(reference_mean - estimate_mean, 3)


def get_evaluation_metric_results(df):
    """Compute accuracy, precision, recall and F1 over all boolean label cells.

    Only ``bool`` columns of ``df`` are considered. Columns ending in
    ``"_pred"`` are predictions; every other boolean column is treated as a
    reference label. Both groups are flattened row by row into one long
    vector each, and the metrics are computed on those vectors.

    Flattening compares cells purely by position, so the two column groups
    must be in matching order. To make that independent of the frame's
    column order, predictions are re-ordered to follow the reference columns
    whenever every reference column ``<name>`` has exactly one
    ``<name>_pred`` partner. If the names do not pair up that way, the
    original positional order is used unchanged.

    Args:
        df: DataFrame holding boolean reference and ``*_pred`` columns.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``, each rounded to three
        decimals. Precision/recall/F1 use ``average='binary'``.
    """
    labelled = df.select_dtypes(include=["bool"])
    pred_columns = [
        column for column in labelled.columns if column.endswith("_pred")]
    true_columns = [
        column for column in labelled.columns if not column.endswith("_pred")]

    # Pair predictions with their reference column by name so a different
    # column order cannot silently compare unrelated labels.
    paired_pred_columns = [f"{column}_pred" for column in true_columns]
    if labelled.columns.is_unique and sorted(paired_pred_columns) == sorted(pred_columns):
        pred_columns = paired_pred_columns

    pred = labelled[pred_columns].to_numpy().ravel()
    true = labelled[true_columns].to_numpy().ravel()

    accuracy = accuracy_score(true, pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true, pred, average='binary')
    return round(accuracy, 3), round(precision, 3), round(recall, 3), round(f1, 3)


def get_labelling_score(method, selection):
    """Return the mean of the ``score`` column of one labelled-sections CSV.

    The file is read from
    ``data/acl/sections/labelled/<method>/<selection>.csv`` (relative to the
    working directory), using its first column as the index.

    Args:
        method: Sub-directory name under ``labelled``.
        selection: CSV file name without the ``.csv`` extension.

    Returns:
        The mean score rounded to three decimals.
    """
    labelled_sections = pd.read_csv(
        f"data/acl/sections/labelled/{method}/{selection}.csv", index_col=0)
    mean_score = labelled_sections["score"].mean()
    return round(mean_score, 3)


In [3]:
# Collect the entries directly below the NeurIPS evaluation directory.
# Without ``recursive=True`` the trailing "**" behaves like a single "*",
# so only direct children are matched. Each match is later handed to
# ``read_all_csv_files`` as a directory.
evaluation_pattern = "data/paperswithcode/evaluated/classification/bert_base_uncased/neurips/**"
evaluations = glob.glob(evaluation_pattern)


In [4]:
# Build one record per (evaluation directory, result file) with the metrics
# reported in the tables below.
res = []
for evaluation in evaluations:
    # Use the directory name as the labelling-method identifier. Going
    # through os.path works with both "/" and "\\" separators, whereas
    # splitting on "\\" only extracts the name on Windows and leaves the
    # full path in place elsewhere.
    labelling_method = os.path.basename(os.path.normpath(evaluation))
    results = read_all_csv_files(f"{evaluation}/")
    for key in results.keys():
        frame = results[key]
        correlation = get_correlation(frame)
        kappa = get_kappa(frame)
        # mean_diff / precision / recall / f1 are computed but not reported;
        # they stay available so the commented-out keys below can be
        # re-enabled without further changes.
        mean_diff = analyze_reprod_score(frame)
        accuracy, precision, recall, f1 = get_evaluation_metric_results(
            frame)
        res.append(
            {
                "correlation": correlation,
                "kappa": kappa,
                # "mean_diff": mean_diff,
                "accuracy": accuracy,
                # "precision": precision,
                # "recall": recall,
                # "f1": f1,
                "labelling_method": labelling_method,
                "type": key.replace("neurips_", ""),
                # "labelling_score": get_labelling_score(labelling_method.split("_")[-1], "_".join(labelling_method.split("_")[:-1]))
            }
        )


In [5]:
# One row per (labelling run, result type). Run names have the form
# "<labelling>_<method>" (e.g. "grouped_textsim"): the last "_"-separated
# token becomes ``method`` and the remaining tokens, joined by spaces,
# replace ``labelling_method``.
analysis = pd.DataFrame(res)
name_parts = analysis.labelling_method.str.split("_")
analysis["method"] = name_parts.str[-1]
analysis["labelling_method"] = name_parts.str[:-1].str.join(" ")


In [6]:
# Full per-run table, sorted by method and labelling run, indexed by
# (method, labelling_method, type) and rendered with three decimals.
# An earlier variant highlighted maxima in green and the minimum
# ``mean_diff`` in red; ``mean_diff`` is currently not collected in ``res``.
# NOTE(review): this cell highlights the column *minimum* in green, while the
# correlation-ranked table further down highlights the maximum -- confirm
# which one is intended here.
x = (
    pd.DataFrame(res)
    .set_index(["labelling_method", "type"])
    .reset_index()
    .round(3)
)
x["method"] = x.labelling_method.str.split("_").str[-1]
(
    x.sort_values(by=["method", "labelling_method"])
    .set_index(["method", "labelling_method", "type"])
    .round(3)
    .style
    .highlight_min(color="green", axis=0)
    .format("{:.3f}")
)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,correlation,kappa,accuracy
method,labelling_method,type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
textsim,content_textsim,base,0.549,0.521,0.665
textsim,content_textsim,consecutive,0.554,0.521,0.665
textsim,content_textsim,grouped,0.376,0.337,0.602
textsim,content_textsim,grouped_consecutive,0.37,0.337,0.602
textsim,grouped_textsim,base,0.579,0.542,0.697
textsim,grouped_textsim,consecutive,0.581,0.542,0.697
textsim,grouped_textsim,grouped,0.41,0.382,0.615
textsim,grouped_textsim,grouped_consecutive,0.412,0.382,0.615
textsim,header_content_textsim,base,0.578,0.523,0.685
textsim,header_content_textsim,consecutive,0.571,0.523,0.685


In [14]:
# Rank every (labelling run, type) combination by correlation, best first,
# and highlight the column-wise maximum of each metric in green.
# An earlier variant also highlighted the minimum ``mean_diff`` in red;
# ``mean_diff`` is currently not collected in ``res``.
(
    pd.DataFrame(res)
    .set_index(["labelling_method", "type"])
    .sort_values(by=["correlation"], ascending=False)
    .round(3)
    .style
    .highlight_max(color="green", axis=0)
    .format("{:.3f}")
)


Unnamed: 0_level_0,Unnamed: 1_level_0,correlation,kappa,accuracy
labelling_method,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
grouped_zeroshot,consecutive,0.661,0.648,0.697
grouped_zeroshot,base,0.651,0.648,0.697
header_content_zeroshot,base,0.631,0.631,0.665
header_content_zeroshot,consecutive,0.626,0.631,0.665
header_plus_content_zeroshot,consecutive,0.624,0.556,0.698
header_plus_content_zeroshot,base,0.617,0.556,0.698
header_plus_textsim,base,0.602,0.613,0.668
header_plus_textsim,consecutive,0.597,0.613,0.668
header_plus_zeroshot,base,0.594,0.54,0.608
header_plus_zeroshot,consecutive,0.587,0.54,0.608


In [8]:
# Flag each row by whether its result type mentions "consecutive"
# (both "consecutive" and "grouped_consecutive" count as consecutive).
analysis["consecutive"] = [
    "consecutive" if "consecutive" in result_type else "non-consecutive"
    for result_type in analysis.type
]


In [9]:
# Mean of the numeric metrics for every (method, labelling_method) pair,
# averaged over all result types.
(
    analysis
    .groupby(["method", "labelling_method"])
    .mean(numeric_only=True)
    .round(3)
)


Unnamed: 0_level_0,Unnamed: 1_level_0,correlation,kappa,accuracy
method,labelling_method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
textsim,content,0.462,0.429,0.634
textsim,grouped,0.495,0.462,0.656
textsim,header,0.436,0.411,0.624
textsim,header content,0.484,0.422,0.646
textsim,header plus,0.501,0.485,0.638
textsim,header plus content,0.492,0.451,0.661
zeroshot,content,0.508,0.467,0.636
zeroshot,grouped,0.52,0.524,0.672
zeroshot,header,0.373,0.391,0.575
zeroshot,header content,0.511,0.528,0.644


In [10]:
# Compare the "grouped" and "base" result types only: mean of the numeric
# metrics per type, with ``type`` returned as a regular column.
(
    analysis
    .loc[lambda frame: frame["type"].isin(["grouped", "base"])]
    .groupby("type")
    .mean(numeric_only=True)
    .round(3)
    .reset_index()
)


Unnamed: 0,type,correlation,kappa,accuracy
0,base,0.571,0.547,0.663
1,grouped,0.396,0.366,0.611


In [11]:
# Mean of the numeric metrics per method, with ``method`` returned as a
# regular column.
(
    analysis
    .groupby(["method"])
    .mean(numeric_only=True)
    .round(3)
    .reset_index()
)


Unnamed: 0,method,correlation,kappa,accuracy
0,textsim,0.478,0.443,0.643
1,zeroshot,0.486,0.469,0.632


In [12]:
# Mean of the numeric metrics per result type, with ``type`` returned as a
# regular column.
(
    analysis
    .groupby(["type"])
    .mean(numeric_only=True)
    .round(3)
    .reset_index()
)


Unnamed: 0,type,correlation,kappa,accuracy
0,base,0.571,0.547,0.663
1,consecutive,0.568,0.547,0.663
2,grouped,0.396,0.366,0.611
3,grouped_consecutive,0.394,0.366,0.611


In [13]:
# Mean of the numeric metrics per labelling method (both methods and all
# result types pooled); the best value of each column is highlighted in green.
(
    analysis
    .groupby("labelling_method")
    .mean(numeric_only=True)
    .round(3)
    .style
    .highlight_max(color="green", axis=0)
    .format("{:.3f}")
)

Unnamed: 0_level_0,correlation,kappa,accuracy
labelling_method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
content,0.485,0.448,0.635
grouped,0.508,0.493,0.664
header,0.404,0.401,0.6
header content,0.498,0.475,0.645
header plus,0.503,0.471,0.614
header plus content,0.494,0.45,0.667


In [18]:
# Show the "base" rows of the "header plus content" labelling method,
# one row per method.
analysis.loc[
    lambda frame: frame["type"].isin(["base"])
    & frame["labelling_method"].isin(["header plus content"])
]


Unnamed: 0,correlation,kappa,accuracy,labelling_method,type,method,consecutive
24,0.568,0.528,0.692,header plus content,base,textsim,non-consecutive
28,0.617,0.556,0.698,header plus content,base,zeroshot,non-consecutive
