In [55]:
import glob
import pandas as pd
from e2e_system.reproder import _ground_truth
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from util.file_helper import read_all_csv_files


In [56]:
def get_correlation(df, round_step=3):
    """Return the correlation between the ``score`` and ``reprod_score`` columns.

    Args:
        df: DataFrame exposing ``score`` and ``reprod_score`` columns.
        round_step: Number of decimal places kept in the result.

    Returns:
        The correlation coefficient (pandas ``Series.corr`` with its default
        method), rounded to ``round_step`` decimals.
    """
    observed = df.score
    reproduced = df.reprod_score
    coefficient = observed.corr(other=reproduced)
    return round(coefficient, round_step)


def get_kappa(df, round_step=3):
    """Return the quadratic-weighted Cohen's kappa of ``score`` vs ``reprod_score``.

    Args:
        df: DataFrame exposing ``score`` and ``reprod_score`` columns.
        round_step: Number of decimal places kept in the result. Defaults to 3,
            the precision this function previously hard-coded, so existing
            callers are unaffected.

    Returns:
        ``cohen_kappa_score(..., weights="quadratic")`` rounded to
        ``round_step`` decimals.
    """
    kappa = cohen_kappa_score(df.score, df.reprod_score, weights="quadratic")
    return round(kappa, round_step)


In [57]:
# Collect every entry under the hierarchical evaluation folder whose name
# contains "auto". Without recursive=True, "**" behaves like a single "*".
# glob returns matches in arbitrary order, so sort them to keep the rows of
# the downstream result tables in a reproducible order across runs/platforms.
evaluations = sorted(
    glob.glob("data/paperswithcode/evaluated/hierarchical/**auto**")
)


In [64]:
# Show the matched evaluation paths as a sanity check before loading them.
evaluations


['data/paperswithcode/evaluated/hierarchical\\autolabelled']

In [70]:
def get_evaluation_metric_results(true, pred, round_step=3):
    """Return the rounded accuracy of ``pred`` against ``true``.

    Args:
        true: Reference labels, passed to ``accuracy_score`` as ``y_true``.
        pred: Predicted labels, passed to ``accuracy_score`` as ``y_pred``.
        round_step: Number of decimal places kept in the result. Defaults to 3,
            the precision this function previously hard-coded, so existing
            callers are unaffected.

    Returns:
        The accuracy rounded to ``round_step`` decimals.
    """
    accuracy = accuracy_score(true, pred)
    return round(accuracy, round_step)


In [71]:
# Build one summary record per (evaluation folder, result file) pair.
res = []
for evaluation in evaluations:
    results = read_all_csv_files(f"{evaluation}/")
    # Label rows with the folder name only. glob yields "\\" separators on
    # Windows and "/" elsewhere, so normalise both before taking the last
    # path component. This is computed once per folder into its own name
    # instead of overwriting the loop variable inside the inner loop.
    evaluation_name = evaluation.replace("\\", "/").rstrip("/").split("/")[-1]
    for key in results.keys():
        frame = results[key]
        # Kappa and accuracy are only computed for the direct classification
        # results; every other result type gets a placeholder 0.
        # NOTE(review): these placeholder zeros are indistinguishable from a
        # real 0 and are included in any later aggregation over the columns.
        is_classification = key == "direct_classification"
        res.append(
            {
                "correlation": get_correlation(frame),
                "kappa": get_kappa(frame) if is_classification else 0,
                "accuracy": (
                    get_evaluation_metric_results(frame.score, frame.reprod_score)
                    if is_classification
                    else 0
                ),
                "evaluation": evaluation_name,
                "type": key.replace("neurips_", ""),
            }
        )


In [72]:
def _filter_label(evaluation_name):
    """Map an evaluation name to "filtered" (ends with "_limited") or "non-filtered"."""
    if evaluation_name.endswith("_limited"):
        return "filtered"
    return "non-filtered"


# One row per collected record, plus a column telling whether the evaluation
# name carries the "_limited" suffix.
analysis = pd.DataFrame(res)
analysis["data_filter"] = analysis.evaluation.apply(_filter_label)


In [73]:
# Metrics per (evaluation, type), with the best value of each column
# highlighted in green and every number shown with three decimals.
(
    pd.DataFrame(res)
    .set_index(["evaluation", "type"])
    .round(3)
    .style
    .highlight_max(color="green", axis=0)
    .format("{:.3f}")
)


Unnamed: 0_level_0,Unnamed: 1_level_0,correlation,kappa,accuracy
evaluation,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
autolabelled,direct_classification,0.495,0.404,0.3
autolabelled,direct_coeff,0.331,0.0,0.0
autolabelled,weighted_score,0.534,0.0,0.0


In [62]:
# Same metrics table, ordered from highest to lowest correlation.
# NOTE(review): the saved output of this cell has no "accuracy" column and its
# execution count is lower than the cell that builds `res`; it looks stale.
# Re-run the notebook top to bottom to refresh it.
(
    pd.DataFrame(res)
    .set_index(["evaluation", "type"])
    .round(3)
    .sort_values(by="correlation", ascending=False)
    .style
    .highlight_max(color="green", axis=0)
    .format("{:.3f}")
)


Unnamed: 0_level_0,Unnamed: 1_level_0,correlation,kappa
evaluation,type,Unnamed: 2_level_1,Unnamed: 3_level_1
autolabelled,weighted_score,0.534,0.0
autolabelled,direct_classification,0.495,0.404
autolabelled,direct_coeff,0.331,0.0


In [63]:
# Average of every numeric metric per data-filter group, to three decimals.
# NOTE(review): rows that carry the placeholder 0 for kappa/accuracy are part
# of these means, so those two columns are pulled towards zero.
(
    analysis
    .groupby("data_filter")
    .mean(numeric_only=True)
    .round(3)
)


Unnamed: 0_level_0,correlation,kappa
data_filter,Unnamed: 1_level_1,Unnamed: 2_level_1
non-filtered,0.453,0.135
