In [4]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score, ConfusionMatrixDisplay
import pandas as pd
from e2e_system.reproder import _ground_truth
import os


In [5]:
exp_main_dir = "data/paperswithcode/evaluated/classification"
exp_name = "bert_base_uncased/header_plus_content_textsim"
exp_dir = os.path.join(exp_main_dir, exp_name)


In [7]:
base = pd.read_csv(os.path.join(exp_dir, 'neurips_base.csv'))
grouped = pd.read_csv(os.path.join(
    exp_dir, 'neurips_grouped.csv'))
consecutive = pd.read_csv(os.path.join(
    exp_dir, 'neurips_consecutive.csv'))
grouped_consecutive = pd.read_csv(os.path.join(
    exp_dir, 'neurips_grouped_consecutive.csv'))


### Reproducibility Score Analysis


In [9]:
def analyze_reprod_score(df, type):
    df = df[["true_reprod_score", "reprod_score"]].describe()
    df["diff"] = df["reprod_score"] - df["true_reprod_score"]
    df = df.T.round(3)
    df.set_index([[type, type, type], [
        "true", "pred", "diff"]], inplace=True)
    return df


#### Correlation


In [12]:
base_score_corr = base[["stars", "true_reprod_score", "reprod_score"]].corr()
grouped_score_corr = grouped[[
    "stars", "true_reprod_score", "reprod_score"]].corr()
consecutive_score_corr = consecutive[[
    "stars", "true_reprod_score", "reprod_score"]].corr()
grouped_consecutive_score_corr = grouped_consecutive[[
    "stars", "true_reprod_score", "reprod_score"]].corr()
pd.concat([base_score_corr, grouped_score_corr, consecutive_score_corr, grouped_consecutive_score_corr], keys=[
          "base", "grouped", "consecutive", "grouped_consecutive"]).round(3)


Unnamed: 0,Unnamed: 1,stars,true_reprod_score,reprod_score
base,stars,1.0,0.023,-0.013
base,true_reprod_score,0.023,1.0,0.568
base,reprod_score,-0.013,0.568,1.0
grouped,stars,1.0,0.023,-0.116
grouped,true_reprod_score,0.023,1.0,0.417
grouped,reprod_score,-0.116,0.417,1.0
consecutive,stars,1.0,0.023,-0.003
consecutive,true_reprod_score,0.023,1.0,0.569
consecutive,reprod_score,-0.003,0.569,1.0
grouped_consecutive,stars,1.0,0.023,-0.113


### Classification Score Analysis


In [13]:
def analyze_classification_score(df):
    score_columns = [
        column for column in df.columns if "score" in column and "reprod" not in column]
    dct = {}
    for column in score_columns:
        label_df = df[df[column] != 0][column]
        desc = label_df.describe()
        dct[column] = desc

    all_scores = df[score_columns].values.flatten()
    all_scores = all_scores[all_scores != 0]
    return dct, all_scores.mean().round(3)


In [14]:
base_classification_desc, base_classification_mean = analyze_classification_score(
    base)
grouped_classification_desc, grouped_classification_mean = analyze_classification_score(
    grouped)

pd.concat([pd.DataFrame(base_classification_desc).T, pd.DataFrame(grouped_classification_desc).T], keys=[
    "base", "grouped"]).style.background_gradient(cmap='Blues', axis=0).format("{:.3f}")


Unnamed: 0,Unnamed: 1,count,mean,std,min,25%,50%,75%,max
base,score,100.0,3.9,1.259,1.0,3.0,4.0,5.0,6.0
base,introduction_score,78.0,0.747,0.118,0.33,0.67,0.774,0.836,0.94
base,requirements_score,70.0,0.823,0.14,0.427,0.729,0.836,0.953,0.987
base,pretrained_model_score,34.0,0.716,0.152,0.354,0.62,0.724,0.844,0.958
base,training_score,81.0,0.732,0.143,0.402,0.631,0.759,0.838,0.968
base,evaluation_score,68.0,0.661,0.125,0.397,0.561,0.653,0.757,0.904
base,results_score,86.0,0.684,0.15,0.265,0.607,0.711,0.805,0.931
grouped,score,100.0,3.9,1.259,1.0,3.0,4.0,5.0,6.0
grouped,introduction_score,71.0,0.758,0.123,0.428,0.67,0.779,0.852,0.94
grouped,requirements_score,57.0,0.84,0.158,0.427,0.732,0.892,0.97,0.987


In [None]:
print("Classification score means")
print(f"base: {base_classification_mean}, grouped: {grouped_classification_mean}")


### Accuracy Analysis


In [15]:
def evaluate(true, pred):
    confusion_mtx = confusion_matrix(true, pred)
    accuracy = accuracy_score(true, pred)
    precision, recall, f1, support = precision_recall_fscore_support(
        true, pred, average='binary')
    return accuracy, precision, recall, f1


def eval_df(df):
    labelled = df.select_dtypes(include=["bool"])
    pred_columns = [
        column for column in labelled.columns if column.endswith("_pred")]
    true_columns = [
        column for column in labelled.columns if not column.endswith("_pred")]
    pred = labelled[pred_columns].values.flatten()
    true = labelled[true_columns].values.flatten()
    return evaluate(true, pred)


In [16]:
scores_df = pd.DataFrame(columns=["accuracy", "precision", "recall", "f1"])
scores_df.loc["base"] = eval_df(base)
scores_df.loc["grouped"] = eval_df(grouped)
scores_df.style.highlight_max(color="green", axis=0).format("{:.4f}")


Unnamed: 0,accuracy,precision,recall,f1
base,0.6917,0.7458,0.7974,0.7708
grouped,0.63,0.7456,0.6538,0.6967


In [None]:
def evaluate_on_label(df) -> pd.DataFrame:
    scores_by_label = pd.DataFrame(
        columns=["accuracy", "precision", "recall", "f1"])
    for label in _ground_truth:
        if label == "introduction":
            continue
        true = df.loc[:, label].values.flatten().tolist()
        pred = df.loc[:, f"{label}_pred"].values.flatten().tolist()
        scores_by_label.loc[label] = evaluate(true, pred)
    return scores_by_label


In [None]:
pd.concat([evaluate_on_label(base), evaluate_on_label(grouped)], keys=[
    "base", "grouped"]).style.highlight_max(color="green", axis=0).format("{:.4f}")


### Exact Match Analysis


In [None]:
def eval_exact_matches(df):
    labelled = df.select_dtypes(include=["bool"])
    pred_columns = [
        column for column in labelled.columns if column.endswith("_pred")]
    true_columns = [
        column for column in labelled.columns if not column.endswith("_pred")]
    pred = labelled[pred_columns].values
    true = labelled[true_columns].values
    exact_match = 0
    true_match = 0
    for readme_pred, readme_true in zip(pred, true):
        if (readme_pred == readme_true).all():
            exact_match += 1
        if readme_pred.sum() == readme_true.sum():
            true_match += 1
    return round(exact_match/len(pred), 3), round(true_match/len(pred), 3)


In [None]:
exact_matches_df = {"base": eval_exact_matches(base)}
exact_matches_df["grouped"] = eval_exact_matches(grouped)
pd.DataFrame(exact_matches_df).set_index(pd.Index(
    ["exact_match_ratio", "true_match_ratio"])).T.style.highlight_max(color="green", axis=0).format("{:.4f}")
