In [2]:
import numpy as np
from src.run_shap import load_shap_vals
import pickle
from tqdm import tqdm
from src.utils import token_segments
from src.utils import legacy_get_dataset_info
from transformers import AutoTokenizer
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib import rcParams
from src.utils import legacy_get_dataset_info

  from .autonotebook import tqdm as notebook_tqdm


## Kendalls tau

In [3]:
results = {}
tab_scale_factor = 1
rows = []
for ds_name in [
    "prod_sent",
    "kick",
    "jigsaw",
    "wine",
    "fake",
    "imdb_genre",
    "channel",
    "airbnb",
    "salary",
]:  #
    for text_model_code in [
        "disbert",
        "bert",
        "drob",
        "deberta",
    ]:
        # summary_plot(ds_name)
        di = legacy_get_dataset_info(ds_name)
        tab_pre = f"_sf{tab_scale_factor}" if tab_scale_factor != 2 else ""
        vals = []
        for name in [
            "ensemble_25",
            "ensemble_50",
            "ensemble_75",
            "stack",
            "all_text",
        ]:
            filepath = f"../models/shap_vals_{text_model_code}{tab_pre}/{ds_name}/summed_shap_vals_{name}.pkl"
            with open(filepath, "rb") as f:
                grouped_shap_vals = pickle.load(f)
            abs_ft = np.sum(np.abs(grouped_shap_vals), axis=0)
            cols = [f"(Tab) {col}" for col in di.tab_cols] + [
                f"(Text) {col}" for col in di.text_cols
            ]
            vals.append(abs_ft)

        name = "all_text_baseline"
        col_name_filepath = f"../models/shap_vals_{text_model_code}{tab_pre}/{ds_name}/col_names_shap_vals_all_text_baseline.pkl"
        colon_filepath = f"../models/shap_vals_{text_model_code}{tab_pre}/{ds_name}/colon_shap_vals_all_text_baseline.pkl"
        fts_filepath = f"../models/shap_vals_{text_model_code}{tab_pre}/{ds_name}/summed_shap_vals_all_text_baseline.pkl"

        with open(col_name_filepath, "rb") as f:
            grouped_col_name_shap_vals = pickle.load(f)
        with open(colon_filepath, "rb") as f:
            grouped_colon_shap_vals = pickle.load(f)
        with open(fts_filepath, "rb") as f:
            grouped_shap_vals = pickle.load(f)
        val_only = (
            grouped_shap_vals - grouped_col_name_shap_vals - grouped_colon_shap_vals
        )
        grouped_template = np.mean(
            grouped_shap_vals - val_only, axis=2, keepdims=True)

        cols = [f"(Tab) {col}" for col in di.tab_cols] + [
            f"(Text) {col}" for col in di.text_cols
        ]
        abs_ft = np.sum(np.abs(grouped_shap_vals), axis=0)
        vals.append(abs_ft)
        rows.append([ds_name, text_model_code, vals])
pre_tau_df = pd.DataFrame(
    rows,
    columns=["ds_name", "text_model", "vals"],
)
# unranked_df_no_template.to_csv("unranked_df_no_template.csv", index=False)

No model type specified for prod_sent. (This is fine during dataset creation)
No model type specified for kick. (This is fine during dataset creation)
No model type specified for jigsaw. (This is fine during dataset creation)
No model type specified for wine. (This is fine during dataset creation)
No model type specified for fake. (This is fine during dataset creation)
No model type specified for imdb_genre. (This is fine during dataset creation)
No model type specified for channel. (This is fine during dataset creation)
No model type specified for airbnb. (This is fine during dataset creation)
No model type specified for salary. (This is fine during dataset creation)


In [4]:
# Now for the whole tau_df
from scipy import stats

methods = [
    "ensemble_25",
    "ensemble_50",
    "ensemble_75",
    "stack",
    "all_text",
    "all_text_baseline",
]
method_comparions = []
for row in pre_tau_df.itertuples():
    for i, method in enumerate(methods):
        for j, method2 in enumerate(methods):
            if i < j:
                taus = []
                p_values = []
                vals = np.stack(row.vals, axis=2)
                for instance in vals:
                    tau, p_value = stats.kendalltau(
                        instance[:, i], instance[:, j])
                    taus.append(tau)
                    p_values.append(p_value)
                method_comparions.append(
                    [
                        row.ds_name,
                        row.text_model,
                        method + " vs " + method2,
                        np.mean(taus),
                        np.std(taus),
                        taus,
                        p_values,
                    ]
                )

In [5]:
tau_df = pd.DataFrame(
    method_comparions,
    columns=[
        "ds_name",
        "text_model",
        "methodA_vs_methodB",
        "mean_tau",
        "std_tau",
        "taus",
        "p_values",
    ],
)
# tau_df.to_csv("tau_df.csv", index=False)

In [39]:
tau_df

Unnamed: 0,ds_name,text_model,methodA_vs_methodB,mean_tau,std_tau,taus,p_values,mean (std)
0,prod_sent,disbert,ensemble_25 vs ensemble_50,-0.060000,0.998198,"[-1.0, -1.0, -1.0, 1.0, -1.0, 1.0, -1.0, -1.0,...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",-0.06
1,prod_sent,disbert,ensemble_25 vs ensemble_75,-0.940000,0.341174,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",-0.94
2,prod_sent,disbert,ensemble_25 vs stack,1.000000,0.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.00
3,prod_sent,disbert,ensemble_25 vs all_text,1.000000,0.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.00
4,prod_sent,disbert,ensemble_25 vs all_text_baseline,0.620000,0.784602,"[1.0, 1.0, 1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0.62
...,...,...,...,...,...,...,...,...
535,salary,deberta,ensemble_75 vs all_text,0.556000,0.234041,"[0.4666666666666666, 0.6, 0.7333333333333333, ...","[0.2722222222222222, 0.1361111111111111, 0.055...",0.56
536,salary,deberta,ensemble_75 vs all_text_baseline,0.640000,0.195505,"[0.2, 0.7333333333333333, 0.8666666666666666, ...","[0.7194444444444444, 0.05555555555555555, 0.01...",0.64
537,salary,deberta,stack vs all_text,0.466667,0.275197,"[0.9999999999999999, 0.4666666666666666, 0.866...","[0.002777777777777778, 0.2722222222222222, 0.0...",0.47
538,salary,deberta,stack vs all_text_baseline,0.513333,0.248283,"[0.7333333333333333, 0.3333333333333333, 0.733...","[0.05555555555555555, 0.46944444444444444, 0.0...",0.51


In [6]:
all_comparsions = {}
for row in tau_df.itertuples():
    # Exclude the following:
    # 1. ds_name == "channel" and method in ["all_text_baseline", "all_text"]
    # 2. ds_name == "salary" and method == "stack"
    # 3. ds_name == "wine" and method == "stack"
    # 4. ds_name == "prod_sent" and method == "stack" and text_model == "disbert"
    if row.ds_name == "channel" and "all_text" in row.methodA_vs_methodB:
        continue
    if row.ds_name in ("salary", "wine") and "stack" in row.methodA_vs_methodB:
        continue
    if (
        row.ds_name == "prod_sent"
        and "stack" in row.methodA_vs_methodB
        and row.text_model == "disbert"
    ):
        continue
    if row.methodA_vs_methodB not in all_comparsions:
        all_comparsions[row.methodA_vs_methodB] = []
    all_comparsions[row.methodA_vs_methodB].extend(row.taus)

In [7]:
all_comparsions_summary = {}
for key, val in all_comparsions.items():
    all_comparsions_summary[key] = [np.mean(val), np.std(val), len(val)]

In [8]:
summary_df = pd.DataFrame(all_comparsions_summary).T
summary_df.columns = ["mean_tau", "std_tau", "n"]

In [9]:
summary_df["mean (std)"] = summary_df.apply(
    lambda row: f"{row.mean_tau:.2f} ({row.std_tau:.2f})", axis=1
)

In [10]:
summary_df

Unnamed: 0,mean_tau,std_tau,n,mean (std)
ensemble_25 vs ensemble_50,0.780177,0.368787,3600.0,0.78 (0.37)
ensemble_25 vs ensemble_75,0.581361,0.500739,3600.0,0.58 (0.50)
ensemble_25 vs all_text,0.441283,0.414133,3200.0,0.44 (0.41)
ensemble_25 vs all_text_baseline,0.307601,0.449035,3200.0,0.31 (0.45)
ensemble_50 vs ensemble_75,0.789799,0.394544,3600.0,0.79 (0.39)
ensemble_50 vs all_text,0.444798,0.47489,3200.0,0.44 (0.47)
ensemble_50 vs all_text_baseline,0.360699,0.470265,3200.0,0.36 (0.47)
ensemble_75 vs all_text,0.365253,0.543899,3200.0,0.37 (0.54)
ensemble_75 vs all_text_baseline,0.367927,0.478542,3200.0,0.37 (0.48)
all_text vs all_text_baseline,0.446403,0.447058,3200.0,0.45 (0.45)


## Kendalls tau, split by text models

In [11]:
results = {}
tab_scale_factor = 1
rows = []
for ds_name in [
    "prod_sent",
    "kick",
    "jigsaw",
    "wine",
    "fake",
    "imdb_genre",
    "channel",
    "airbnb",
    "salary",
]:  #
    for name in [
        "ensemble_25",
        "ensemble_50",
        "ensemble_75",
        "stack",
        "all_text",
    ]:
        # summary_plot(ds_name)
        di = legacy_get_dataset_info(ds_name)
        tab_pre = f"_sf{tab_scale_factor}" if tab_scale_factor != 2 else ""
        vals = []
        for text_model_code in [
            "disbert",
            "bert",
            "drob",
            "deberta",
        ]:
            filepath = f"../models/shap_vals_{text_model_code}{tab_pre}/{ds_name}/summed_shap_vals_{name}.pkl"
            with open(filepath, "rb") as f:
                grouped_shap_vals = pickle.load(f)
            abs_ft = np.sum(np.abs(grouped_shap_vals), axis=0)
            cols = [f"(Tab) {col}" for col in di.tab_cols] + [
                f"(Text) {col}" for col in di.text_cols
            ]
            vals.append(abs_ft)
        rows.append([ds_name, name, vals])

    name = "all_text_baseline"
    for text_model_code in [
        "disbert",
        "bert",
        "drob",
        "deberta",
    ]:
        col_name_filepath = f"../models/shap_vals_{text_model_code}{tab_pre}/{ds_name}/col_names_shap_vals_all_text_baseline.pkl"
        colon_filepath = f"../models/shap_vals_{text_model_code}{tab_pre}/{ds_name}/colon_shap_vals_all_text_baseline.pkl"
        fts_filepath = f"../models/shap_vals_{text_model_code}{tab_pre}/{ds_name}/summed_shap_vals_all_text_baseline.pkl"

        with open(col_name_filepath, "rb") as f:
            grouped_col_name_shap_vals = pickle.load(f)
        with open(colon_filepath, "rb") as f:
            grouped_colon_shap_vals = pickle.load(f)
        with open(fts_filepath, "rb") as f:
            grouped_shap_vals = pickle.load(f)
        val_only = (
            grouped_shap_vals - grouped_col_name_shap_vals - grouped_colon_shap_vals
        )
        grouped_template = np.mean(
            grouped_shap_vals - val_only, axis=2, keepdims=True)

        cols = [f"(Tab) {col}" for col in di.tab_cols] + [
            f"(Text) {col}" for col in di.text_cols
        ]
        abs_ft = np.sum(np.abs(grouped_shap_vals), axis=0)
        vals.append(abs_ft)
    rows.append([ds_name, name, vals])
pre_tau_df2 = pd.DataFrame(
    rows,
    columns=["ds_name", "method", "vals"],
)
# unranked_df_no_template.to_csv("unranked_df_no_template.csv", index=False)

No model type specified for prod_sent. (This is fine during dataset creation)
No model type specified for kick. (This is fine during dataset creation)
No model type specified for jigsaw. (This is fine during dataset creation)
No model type specified for wine. (This is fine during dataset creation)
No model type specified for fake. (This is fine during dataset creation)
No model type specified for imdb_genre. (This is fine during dataset creation)
No model type specified for channel. (This is fine during dataset creation)
No model type specified for airbnb. (This is fine during dataset creation)
No model type specified for salary. (This is fine during dataset creation)


In [12]:
np.stack(pre_tau_df2.iloc[0].vals).shape

(4, 100, 2)

In [13]:
pre_tau_df2

Unnamed: 0,ds_name,method,vals
0,prod_sent,ensemble_25,"[[[0.5008984502820633, 0.274237389327027], [0...."
1,prod_sent,ensemble_50,"[[[0.3339323001880409, 0.548474778654053], [0...."
2,prod_sent,ensemble_75,"[[[0.16696615009402102, 0.8227121679810816], [..."
3,prod_sent,stack,"[[[0.6967018983048695, 0.021057132623985314], ..."
4,prod_sent,all_text,"[[[0.6826972063363064, 0.18024434113060123], [..."
5,prod_sent,all_text_baseline,"[[[0.6826972063363064, 0.18024434113060123], [..."
6,kick,ensemble_25,"[[[0.13595617524406073, 0.0, 0.008954340475157..."
7,kick,ensemble_50,"[[[0.09063745016270844, 0.0, 0.003443258992492..."
8,kick,ensemble_75,"[[[0.001484591677663449, 0.001484591677663449,..."
9,kick,stack,"[[[0.08750044316429603, 0.0005075893585239008,..."


In [14]:
# Now for the whole tau_df
from scipy import stats

text_models = [
    "disbert",
    "bert",
    "drob",
    "deberta",
]
model_comparions = []
for row in pre_tau_df2.itertuples():
    for i, model in enumerate(text_models):
        for j, model2 in enumerate(text_models):
            if i < j:
                taus = []
                p_values = []
                vals = np.stack(row.vals, axis=2)
                for instance in vals:
                    tau, p_value = stats.kendalltau(
                        instance[:, i], instance[:, j])
                    taus.append(tau)
                    p_values.append(p_value)
                model_comparions.append(
                    [
                        row.ds_name,
                        row.method,
                        model + " vs " + model2,
                        np.mean(taus),
                        np.std(taus),
                        taus,
                        p_values,
                    ]
                )

In [15]:
tau_df2 = pd.DataFrame(
    model_comparions,
    columns=[
        "ds_name",
        "method",
        "modelA_vs_modelB",
        "mean_tau",
        "std_tau",
        "taus",
        "p_values",
    ],
)
tau_df2.to_csv("tau_df_model.csv", index=False)

In [16]:
tau_df2

Unnamed: 0,ds_name,method,modelA_vs_modelB,mean_tau,std_tau,taus,p_values
0,prod_sent,ensemble_25,disbert vs bert,1.000000,0.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,prod_sent,ensemble_25,disbert vs drob,1.000000,0.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
2,prod_sent,ensemble_25,disbert vs deberta,1.000000,0.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
3,prod_sent,ensemble_25,bert vs drob,1.000000,0.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,prod_sent,ensemble_25,bert vs deberta,1.000000,0.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
...,...,...,...,...,...,...,...
319,salary,all_text_baseline,disbert vs drob,0.417333,0.302673,"[0.4666666666666666, 0.2, 0.06666666666666665,...","[0.2722222222222222, 0.7194444444444444, 1.0, ..."
320,salary,all_text_baseline,disbert vs deberta,0.525333,0.261497,"[0.7333333333333333, 0.6, 0.3333333333333333, ...","[0.05555555555555555, 0.1361111111111111, 0.46..."
321,salary,all_text_baseline,bert vs drob,0.346667,0.285657,"[0.2, 0.3333333333333333, -0.4666666666666666,...","[0.7194444444444444, 0.46944444444444444, 0.27..."
322,salary,all_text_baseline,bert vs deberta,0.569333,0.242839,"[0.7333333333333333, 0.4666666666666666, -0.2,...","[0.05555555555555555, 0.2722222222222222, 0.71..."


In [17]:
# Exclude the following from tau_df2:
# 1. ds_name == "channel" and method in ["all_text_baseline", "all_text"]
mask1 = (tau_df2["ds_name"] == "channel") & (
    tau_df2["method"].isin(["all_text_baseline", "all_text"])
)

# 2. ds_name == "salary" and method == "stack"
mask2 = (tau_df2["ds_name"] == "salary") & (tau_df2["method"] == "stack")

# 3. ds_name == "wine" and method == "stack"
mask3 = (tau_df2["ds_name"] == "wine") & (tau_df2["method"] == "stack")

# 4. ds_name == "prod_sent"
# row.ds_name == "prod_sent" and "stack" in row.methodA_vs_methodB and row.text_model == "disbert":
mask4 = (
    (tau_df2["ds_name"] == "prod_sent")
    & (tau_df2["method"] == "stack")
    & (tau_df2["modelA_vs_modelB"].str.contains("disbert"))
)

exclude_mask = mask1 | mask2 | mask3 | mask4
tau_df2 = tau_df2.loc[~exclude_mask]


all_comparsions = {}
for row in tau_df2.itertuples():
    if row.modelA_vs_modelB not in all_comparsions:
        all_comparsions[row.modelA_vs_modelB] = []
    all_comparsions[row.modelA_vs_modelB].extend(row.taus)

In [18]:
pd.DataFrame(all_comparsions).describe()

ValueError: All arrays must be of the same length

In [None]:
all_comparsions_summary = {}
for key, val in all_comparsions.items():
    all_comparsions_summary[key] = [np.mean(val), np.std(val), len(val)]
summary_df = pd.DataFrame(all_comparsions_summary).T
summary_df.columns = ["mean_tau", "std_tau", "n"]
summary_df["mean (std)"] = summary_df.apply(
    lambda row: f"{row.mean_tau:.2f} ({row.std_tau:.2f})", axis=1
)

In [None]:
summary_df

Unnamed: 0,mean_tau,std_tau,n,mean (std)
disbert vs bert,0.747003,0.267183,4400.0,0.75 (0.27)
disbert vs drob,0.729292,0.285062,4400.0,0.73 (0.29)
disbert vs deberta,0.699818,0.284427,4400.0,0.70 (0.28)
bert vs drob,0.714736,0.29156,4400.0,0.71 (0.29)
bert vs deberta,0.689424,0.283417,4400.0,0.69 (0.28)
drob vs deberta,0.689842,0.301456,4400.0,0.69 (0.30)


In [19]:
tau_df2["mean (std)"] = tau_df2.apply(
    lambda row: f"{row.mean_tau:.2f} ({row.std_tau:.2f})", axis=1
)
tau_df2["mean (std)"] = tau_df2.apply(
    lambda row: f"{row.mean_tau:.2f}", axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [20]:
tau_df2

Unnamed: 0,ds_name,method,modelA_vs_modelB,mean_tau,std_tau,taus,p_values,mean (std)
0,prod_sent,ensemble_25,disbert vs bert,1.000000,0.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.00
1,prod_sent,ensemble_25,disbert vs drob,1.000000,0.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.00
2,prod_sent,ensemble_25,disbert vs deberta,1.000000,0.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.00
3,prod_sent,ensemble_25,bert vs drob,1.000000,0.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.00
4,prod_sent,ensemble_25,bert vs deberta,1.000000,0.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.00
...,...,...,...,...,...,...,...,...
319,salary,all_text_baseline,disbert vs drob,0.417333,0.302673,"[0.4666666666666666, 0.2, 0.06666666666666665,...","[0.2722222222222222, 0.7194444444444444, 1.0, ...",0.42
320,salary,all_text_baseline,disbert vs deberta,0.525333,0.261497,"[0.7333333333333333, 0.6, 0.3333333333333333, ...","[0.05555555555555555, 0.1361111111111111, 0.46...",0.53
321,salary,all_text_baseline,bert vs drob,0.346667,0.285657,"[0.2, 0.3333333333333333, -0.4666666666666666,...","[0.7194444444444444, 0.46944444444444444, 0.27...",0.35
322,salary,all_text_baseline,bert vs deberta,0.569333,0.242839,"[0.7333333333333333, 0.4666666666666666, -0.2,...","[0.05555555555555555, 0.2722222222222222, 0.71...",0.57


In [21]:
def get_model_comp_tbl(ds_name, method):
    mask = (tau_df2["ds_name"] == ds_name) & (tau_df2["method"] == method)
    df = tau_df2.loc[mask]

    def get_value(model_a, model_b):
        query = df["modelA_vs_modelB"] == f"{model_a} vs {model_b}"
        if any(query):
            return df.loc[query]["mean (std)"].values[0]
        else:
            return "NaN"

    row0 = [get_value("disbert", "bert"), "", ""]
    row1 = [get_value("disbert", "drob"), get_value("bert", "drob"), ""]
    row2 = [
        get_value("disbert", "deberta"),
        get_value("bert", "deberta"),
        get_value("drob", "deberta"),
    ]

    return pd.DataFrame(
        [row0, row1, row2],
        columns=["Dis.B", "BERT", "Dis.R"],
        index=["BERT", "Dis.R", "DeB."],
    )

In [22]:
df = get_model_comp_tbl("airbnb", "all_text_baseline")

In [23]:
# concat horizontally
for ds in [
    "prod_sent",
    "kick",
    "jigsaw",
    "wine",
    "fake",
    "imdb_genre",
    "channel",
    "airbnb",
    "salary",
]:
    all_df = pd.concat(
        [
            get_model_comp_tbl(ds, method)
            for method in [
                "all_text_baseline",
                "all_text",
                "ensemble_25",
                "ensemble_50",
                "ensemble_75",
                "stack",
            ]
        ],
        axis=1,
    )
    with open(f"results/tau_model_comp_{ds}.csv", "w") as f:
        all_df.to_csv(f)

# for method in ['all_text_baseline', 'all_text', 'ensemble_25', 'ensemble_50', 'ensemble_75', 'stack']:
#     print(get_model_comp_tbl('airbnb', method))

In [None]:
tau_df

Unnamed: 0,ds_name,text_model,methodA_vs_methodB,mean_tau,std_tau,taus,p_values
0,prod_sent,disbert,ensemble_25 vs ensemble_50,-0.060000,0.998198,"[-1.0, -1.0, -1.0, 1.0, -1.0, 1.0, -1.0, -1.0,...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,prod_sent,disbert,ensemble_25 vs ensemble_75,-0.940000,0.341174,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
2,prod_sent,disbert,ensemble_25 vs stack,1.000000,0.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
3,prod_sent,disbert,ensemble_25 vs all_text,1.000000,0.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,prod_sent,disbert,ensemble_25 vs all_text_baseline,0.620000,0.784602,"[1.0, 1.0, 1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
...,...,...,...,...,...,...,...
535,salary,deberta,ensemble_75 vs all_text,0.556000,0.234041,"[0.4666666666666666, 0.6, 0.7333333333333333, ...","[0.2722222222222222, 0.1361111111111111, 0.055..."
536,salary,deberta,ensemble_75 vs all_text_baseline,0.640000,0.195505,"[0.2, 0.7333333333333333, 0.8666666666666666, ...","[0.7194444444444444, 0.05555555555555555, 0.01..."
537,salary,deberta,stack vs all_text,0.466667,0.275197,"[0.9999999999999999, 0.4666666666666666, 0.866...","[0.002777777777777778, 0.2722222222222222, 0.0..."
538,salary,deberta,stack vs all_text_baseline,0.513333,0.248283,"[0.7333333333333333, 0.3333333333333333, 0.733...","[0.05555555555555555, 0.46944444444444444, 0.0..."


In [None]:
tau_df

Unnamed: 0,ds_name,text_model,methodA_vs_methodB,mean_tau,std_tau,taus,p_values
0,prod_sent,disbert,ensemble_25 vs ensemble_50,-0.060000,0.998198,"[-1.0, -1.0, -1.0, 1.0, -1.0, 1.0, -1.0, -1.0,...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,prod_sent,disbert,ensemble_25 vs ensemble_75,-0.940000,0.341174,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
2,prod_sent,disbert,ensemble_25 vs stack,1.000000,0.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
3,prod_sent,disbert,ensemble_25 vs all_text,1.000000,0.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,prod_sent,disbert,ensemble_25 vs all_text_baseline,0.620000,0.784602,"[1.0, 1.0, 1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
...,...,...,...,...,...,...,...
535,salary,deberta,ensemble_75 vs all_text,0.556000,0.234041,"[0.4666666666666666, 0.6, 0.7333333333333333, ...","[0.2722222222222222, 0.1361111111111111, 0.055..."
536,salary,deberta,ensemble_75 vs all_text_baseline,0.640000,0.195505,"[0.2, 0.7333333333333333, 0.8666666666666666, ...","[0.7194444444444444, 0.05555555555555555, 0.01..."
537,salary,deberta,stack vs all_text,0.466667,0.275197,"[0.9999999999999999, 0.4666666666666666, 0.866...","[0.002777777777777778, 0.2722222222222222, 0.0..."
538,salary,deberta,stack vs all_text_baseline,0.513333,0.248283,"[0.7333333333333333, 0.3333333333333333, 0.733...","[0.05555555555555555, 0.46944444444444444, 0.0..."


In [26]:
tau_df["mean (std)"] = tau_df.apply(lambda row: f"{row.mean_tau:.2f}", axis=1)

In [40]:
def get_method_comp_tbl(ds_name, model):
    mask = (tau_df["ds_name"] == ds_name) & (tau_df["text_model"] == model)
    df = tau_df.loc[mask]

    def get_value(method_a, method_b):
        query = df["methodA_vs_methodB"] == f"{method_a} vs {method_b}"
        if any(query):
            return df.loc[query]["mean (std)"].values[0]
        else:
            return "NaN"

    row0 = [get_value("all_text", "all_text_baseline"), "", "", "", ""]
    row1 = [
        get_value("ensemble_25", "all_text_baseline"),
        get_value("ensemble_25", "all_text"),
        "",
        "",
        "",
    ]
    row2 = [
        get_value("ensemble_50", "all_text_baseline"),
        get_value("ensemble_50", "all_text"),
        get_value("ensemble_25", "ensemble_50"),
        "",
        "",
    ]
    row3 = [
        get_value("ensemble_75", "all_text_baseline"),
        get_value("ensemble_75", "all_text"),
        get_value("ensemble_25", "ensemble_75"),
        get_value("ensemble_50", "ensemble_75"),
        "",
    ]
    row4 = [
        get_value("stack", "all_text_baseline"),
        get_value("stack", "all_text"),
        get_value("ensemble_25", "stack"),
        get_value("ensemble_50", "stack"),
        get_value("ensemble_75", "stack"),
    ]

    return pd.DataFrame(
        [row0, row1, row2, row3, row4],
        columns=["AT (U)", "AT", "WE .25", "WE .50", "WE .75"],
        index=["AT", "WE .25", "WE .50", "WE .75", "Stack"],
    )

In [41]:
df = get_method_comp_tbl("airbnb", "disbert")

In [42]:
df

Unnamed: 0,AT (U),AT,WE .25,WE .50,WE .75
AT,0.37,,,,
WE .25,0.25,0.41,,,
WE .50,0.24,0.43,0.8,,
WE .75,0.2,0.38,0.58,0.73,
Stack,0.23,0.41,0.75,0.74,0.6


In [43]:
for ds in [
    "prod_sent",
    "kick",
    "jigsaw",
    "wine",
    "fake",
    "imdb_genre",
    "channel",
    "airbnb",
    "salary",
]:
    all_df = pd.concat(
        [
            get_method_comp_tbl(ds, model)
            for model in ["disbert", "bert", "drob", "deberta"]
        ],
        axis=1,
    )
    with open(f"results/tau_method_comp_{ds}.csv", "w") as f:
        all_df.to_csv(f)