In [3]:
import pickle
from transformers import AutoTokenizer
import numpy as np
import shap
from src.plot_text import text, get_grouped_vals

# from src.utils import format_fts_for_plotting, text_ft_index_ends, token_segments
# from src.utils import legacy_get_dataset_info
from datasets import load_dataset

# import matplotlib.pyplot as plt
from src.run_shap import load_shap_vals
from tqdm import tqdm
from scipy import stats

import pandas as pd
from src.utils import (
    ConfigLoader,
    text_ft_index_ends,
    token_segments,
    format_fts_for_plotting,
    format_text_fts_too,
)

## Kendall's tau

In [2]:
# When True, every repo-relative path below is prefixed with "../".
add_parent_dir = True

# Config names grouped under the "BERT" label in the loop below.
bert_cfgs = [
    "vet_10c_all_text",
    "vet_19c_ensemble_25",
    "vet_19c_ensemble_50",
    "vet_19c_ensemble_75",
    "vet_19c_stack",
]

# Config names grouped under the "PetBERT" label in the loop below.
petbert_cfgs = [
    "vet_50c_all_text",
    "vet_59c_ensemble_25",
    "vet_59c_ensemble_50",
    "vet_59c_ensemble_75",
    "vet_59c_stack",
]

results = {}
tab_scale_factor = 1
rows = []

# Build one row per text model: [label, list of per-config absolute SHAP sums].
for mdl_name, cfg_set in (("BERT", bert_cfgs), ("PetBERT", petbert_cfgs)):
    vals = []
    for config_name in cfg_set:
        pre = "../" if add_parent_dir else ""
        args = ConfigLoader(
            config_name,
            f"{pre}configs/shap_configs.yaml",
            f"{pre}configs/dataset_default.yaml",
        )
        is_baseline = "baseline" in config_name

        # NOTE: pickle.load can execute arbitrary code; only load files produced
        # by this project.
        if is_baseline:
            # Baseline configs ship two extra files (column names and colons)
            # that are read before the main SHAP file.
            with open(
                f"{pre}models/shap_vals/summed_{config_name}_col_names.pkl", "rb"
            ) as f:
                grouped_col_name_shap_vals = pickle.load(f)
            with open(
                f"{pre}models/shap_vals/summed_{config_name}_colons.pkl", "rb"
            ) as f:
                grouped_colon_shap_vals = pickle.load(f)

        with open(f"{pre}models/shap_vals/summed_{config_name}.pkl", "rb") as f:
            grouped_shap_vals = pickle.load(f)

        if is_baseline:
            # What remains after removing the column-name and colon parts.
            val_only = (
                grouped_shap_vals - grouped_col_name_shap_vals - grouped_colon_shap_vals
            )
            # Mean over axis 2 of the removed (template) part; not used further
            # in this cell.
            grouped_template = np.mean(
                grouped_shap_vals - val_only, axis=2, keepdims=True
            )

        # Display labels: tabular columns first, then text columns.
        cols = [
            f"(Tab) {col}" for col in args.categorical_cols + args.numerical_cols
        ] + [f"(Text) {col}" for col in args.text_cols]

        # Sum of absolute SHAP values over axis 0
        # (presumably the output-class axis - TODO confirm).
        abs_ft = np.sum(np.abs(grouped_shap_vals), axis=0)
        vals.append(abs_ft)

    rows.append([mdl_name, vals])

pre_tau_df = pd.DataFrame(
    rows,
    columns=["text_model", "vals"],
)

Updating with:
{'config': 'vet_10c_all_text', 'my_text_model': 'james-burton/vet_10c', 'ds_name': 'james-burton/vet_month_1c_all_text', 'text_model_base': 'bert-base-uncased', 'model_type': 'all_text', 'ord_ds_name': 'james-burton/vet_month_1c_ordinal', 'text_cols': ['breed', 'region', 'record']}


{'categorical_cols': ['gender', 'neutered', 'species', 'insured'], 'numerical_cols_long': ['age_at_consult', 'Diseases of the ear or mastoid process', 'Mental, behavioural or neurodevelopmental disorders', 'Diseases of the blood or blood-forming organs', 'Diseases of the circulatory system', 'Dental', 'Developmental anomalies', 'Diseases of the digestive system', 'Endocrine, nutritional or metabolic diseases', 'Diseases of the Immune system', 'Certain infectious or parasitic diseases', 'Diseases of the skin', 'Diseases of the musculoskeletal system or connective tissue', 'Neoplasms', 'Diseases of the nervous system', 'Diseases of the visual system', 'Certain conditions originating in the per

## Split by text model

In [22]:
# Combination methods being compared. Position k here is matched against
# position k of each row's ``vals`` list, so the order must agree with the
# order of the configs used to build ``pre_tau_df``.
methods = [
    "all_text",
    "ensemble_25",
    "ensemble_50",
    "ensemble_75",
    "stack",
]

# One summary table per row of ``pre_tau_df`` (i.e. per text model), in row order.
all_summary_dfs = []
for model_row in pre_tau_df.itertuples():
    # Stack the per-method arrays along a new last axis. This does not depend on
    # the method pair, so it is done once per text model instead of once per pair.
    # Presumably the result is (instances, features, methods) - TODO confirm.
    vals = np.stack(model_row.vals, axis=2)

    method_comparions = []
    for i, method in enumerate(methods):
        # Only visit each unordered pair once (i < j).
        for j in range(i + 1, len(methods)):
            method2 = methods[j]
            taus = []
            p_values = []
            for instance in vals:
                # Rank agreement between the two methods for this slice of ``vals``.
                tau, p_value = stats.kendalltau(instance[:, i], instance[:, j])
                taus.append(tau)
                p_values.append(p_value)
            method_comparions.append(
                [
                    model_row.text_model,
                    method + " vs " + method2,
                    np.mean(taus),
                    np.std(taus),
                    taus,
                    p_values,
                ]
            )

    tau_df = pd.DataFrame(
        method_comparions,
        columns=[
            "text_model",
            "methodA_vs_methodB",
            "mean_tau",
            "std_tau",
            "taus",
            "p_values",
        ],
    )

    # Collect the taus per comparison label. A distinct loop variable is used so
    # the outer ``model_row`` is not shadowed.
    all_comparsions = {}
    for tau_row in tau_df.itertuples():
        all_comparsions.setdefault(tau_row.methodA_vs_methodB, []).extend(
            tau_row.taus
        )

    # Per comparison: mean tau, std of tau (np.std default) and number of taus.
    all_comparsions_summary = {}
    for key, val in all_comparsions.items():
        all_comparsions_summary[key] = [np.mean(val), np.std(val), len(val)]

    # Transpose so that each comparison label becomes a row (the index).
    summary_df = pd.DataFrame(all_comparsions_summary).T
    summary_df.columns = ["mean_tau", "std_tau", "n"]
    summary_df["mean (std)"] = summary_df.apply(
        lambda summary_row: f"{summary_row.mean_tau:.2f} ({summary_row.std_tau:.2f})",
        axis=1,
    )
    all_summary_dfs.append(summary_df)

## Together

In [55]:
# Pairwise Kendall's tau between methods, collected for every row of
# ``pre_tau_df`` into a single table and then pooled per method pair.
method_comparions = []
for model_row in pre_tau_df.itertuples():
    # Stack the per-method arrays along a new last axis. This is invariant across
    # method pairs, so it is built once per row rather than once per pair.
    # Presumably the result is (instances, features, methods) - TODO confirm.
    vals = np.stack(model_row.vals, axis=2)
    for i, method in enumerate(methods):
        # Only visit each unordered pair once (i < j).
        for j in range(i + 1, len(methods)):
            method2 = methods[j]
            taus = []
            p_values = []
            for instance in vals:
                tau, p_value = stats.kendalltau(instance[:, i], instance[:, j])
                taus.append(tau)
                p_values.append(p_value)
            method_comparions.append(
                [
                    model_row.text_model,
                    method + " vs " + method2,
                    np.mean(taus),
                    np.std(taus),
                    taus,
                    p_values,
                ]
            )

tau_df = pd.DataFrame(
    method_comparions,
    columns=[
        "text_model",
        "methodA_vs_methodB",
        "mean_tau",
        "std_tau",
        "taus",
        "p_values",
    ],
)
# The "taus" and "p_values" columns hold Python lists; they are written as-is.
tau_df.to_csv("tau_df.csv", index=False)

# Pool the taus of every row that shares a comparison label (so the same method
# pair from different text models ends up in one list).
all_comparsions = {}
for tau_row in tau_df.itertuples():
    all_comparsions.setdefault(tau_row.methodA_vs_methodB, []).extend(tau_row.taus)

# Per comparison: mean tau, std of tau (np.std default) and number of pooled taus.
all_comparsions_summary = {}
for key, val in all_comparsions.items():
    all_comparsions_summary[key] = [np.mean(val), np.std(val), len(val)]

# Transpose so that each comparison label becomes a row (the index).
summary_df = pd.DataFrame(all_comparsions_summary).T
summary_df.columns = ["mean_tau", "std_tau", "n"]
summary_df["mean (std)"] = summary_df.apply(
    lambda summary_row: f"{summary_row.mean_tau:.2f} ({summary_row.std_tau:.2f})",
    axis=1,
)

In [64]:
# Write the current ``summary_df`` to CSV, keeping the index as the first column.
# NOTE(review): ``summary_df`` is reassigned by several cells; this assumes it is
# still the pooled per-method-pair summary from the cell directly above.
summary_df.to_csv("tau_df_method_joined.csv", index=True)

In [61]:
# Display the first per-text-model summary table (tables are appended in the
# row order of ``pre_tau_df``).
all_summary_dfs[0]

Unnamed: 0,mean_tau,std_tau,n,mean (std)
all_text vs ensemble_25,0.117015,0.2,1000.0,0.12 (0.20)
all_text vs ensemble_50,0.212741,0.239721,1000.0,0.21 (0.24)
all_text vs ensemble_75,0.262833,0.299493,1000.0,0.26 (0.30)
all_text vs stack,0.184113,0.246951,1000.0,0.18 (0.25)
ensemble_25 vs ensemble_50,0.679578,0.147932,1000.0,0.68 (0.15)
ensemble_25 vs ensemble_75,0.470726,0.187322,1000.0,0.47 (0.19)
ensemble_25 vs stack,0.551386,0.154537,1000.0,0.55 (0.15)
ensemble_50 vs ensemble_75,0.656382,0.188266,1000.0,0.66 (0.19)
ensemble_50 vs stack,0.611873,0.167302,1000.0,0.61 (0.17)
ensemble_75 vs stack,0.534924,0.215555,1000.0,0.53 (0.22)


In [62]:
# Export the first per-text-model summary, keeping the comparison labels (index).
# The file name assumes entry 0 is the BERT table - it follows ``pre_tau_df`` row order.
all_summary_dfs[0].to_csv("tau_df_method_bert.csv", index=True)

In [63]:
# Export the second per-text-model summary, keeping the comparison labels (index).
# The file name assumes entry 1 is the PetBERT table - it follows ``pre_tau_df`` row order.
all_summary_dfs[1].to_csv("tau_df_method_petbert.csv", index=True)

## Kendall's tau, split by text models

In [31]:
results = {}
tab_scale_factor = 1
rows = []

# Build one row per method: [method, [values for the BERT config, values for the
# PetBERT config]]. The configs are paired positionally with ``methods``.
for method, cfg_pair in zip(methods, zip(bert_cfgs, petbert_cfgs)):
    vals = []
    pre = "../" if add_parent_dir else ""
    for config_name in cfg_pair:
        args = ConfigLoader(
            config_name,
            f"{pre}configs/shap_configs.yaml",
            f"{pre}configs/dataset_default.yaml",
        )
        is_baseline = "baseline" in config_name

        # NOTE: pickle.load can execute arbitrary code; only load files produced
        # by this project.
        if is_baseline:
            # Baseline configs ship two extra files (column names and colons)
            # that are read before the main SHAP file.
            with open(
                f"{pre}models/shap_vals/summed_{config_name}_col_names.pkl", "rb"
            ) as f:
                grouped_col_name_shap_vals = pickle.load(f)
            with open(
                f"{pre}models/shap_vals/summed_{config_name}_colons.pkl", "rb"
            ) as f:
                grouped_colon_shap_vals = pickle.load(f)

        with open(f"{pre}models/shap_vals/summed_{config_name}.pkl", "rb") as f:
            grouped_shap_vals = pickle.load(f)

        if is_baseline:
            # What remains after removing the column-name and colon parts.
            val_only = (
                grouped_shap_vals - grouped_col_name_shap_vals - grouped_colon_shap_vals
            )
            # Mean over axis 2 of the removed (template) part; not used further
            # in this cell.
            grouped_template = np.mean(
                grouped_shap_vals - val_only, axis=2, keepdims=True
            )

        # Display labels: tabular columns first, then text columns.
        cols = [
            f"(Tab) {col}" for col in args.categorical_cols + args.numerical_cols
        ] + [f"(Text) {col}" for col in args.text_cols]

        # Sum of absolute SHAP values over axis 0
        # (presumably the output-class axis - TODO confirm).
        abs_ft = np.sum(np.abs(grouped_shap_vals), axis=0)
        vals.append(abs_ft)

    rows.append([method, vals])

pre_tau_df2 = pd.DataFrame(
    rows,
    columns=["method", "vals"],
)

Updating with:
{'config': 'vet_10c_all_text', 'my_text_model': 'james-burton/vet_10c', 'ds_name': 'james-burton/vet_month_1c_all_text', 'text_model_base': 'bert-base-uncased', 'model_type': 'all_text', 'ord_ds_name': 'james-burton/vet_month_1c_ordinal', 'text_cols': ['breed', 'region', 'record']}


{'categorical_cols': ['gender', 'neutered', 'species', 'insured'], 'numerical_cols_long': ['age_at_consult', 'Diseases of the ear or mastoid process', 'Mental, behavioural or neurodevelopmental disorders', 'Diseases of the blood or blood-forming organs', 'Diseases of the circulatory system', 'Dental', 'Developmental anomalies', 'Diseases of the digestive system', 'Endocrine, nutritional or metabolic diseases', 'Diseases of the Immune system', 'Certain infectious or parasitic diseases', 'Diseases of the skin', 'Diseases of the musculoskeletal system or connective tissue', 'Neoplasms', 'Diseases of the nervous system', 'Diseases of the visual system', 'Certain conditions originating in the per

## Split by method

In [48]:
# Labels for the two entries of each row's ``vals`` list, in order.
text_models = ["BERT", "PetBERT"]

tau_columns = [
    "method",
    "modelA_vs_modelB",
    "mean_tau",
    "std_tau",
    "mean (std)",
    "taus",
    "p_values",
]

# For every method, compare the text models pairwise with Kendall's tau.
model_comparions = []
for row in pre_tau_df2.itertuples():
    for i in range(len(text_models)):
        for j in range(i + 1, len(text_models)):
            model, model2 = text_models[i], text_models[j]
            # Stack the per-model arrays along a new last axis; column k of each
            # slice then belongs to text_models[k].
            vals = np.stack(row.vals, axis=2)
            taus, p_values = [], []
            for instance in vals:
                tau, p_value = stats.kendalltau(instance[:, i], instance[:, j])
                taus.append(tau)
                p_values.append(p_value)
            mean_tau = np.mean(taus)
            std_tau = np.std(taus)
            model_comparions.append(
                [
                    row.method,
                    model + " vs " + model2,
                    mean_tau,
                    std_tau,
                    f"{mean_tau:.2f} ({std_tau:.2f})",
                    taus,
                    p_values,
                ]
            )

tau_df2 = pd.DataFrame(model_comparions, columns=tau_columns)
# Full export, including the raw per-slice tau and p-value lists.
tau_df2.to_csv("tau_df_model_split.csv", index=False)

## Unsplit

In [54]:
# Pool the taus of every ``tau_df2`` row that shares a model-pair label, i.e.
# merge the per-method results into one list per model comparison.
all_comparsions = {}
for row in tau_df2.itertuples():
    all_comparsions.setdefault(row.modelA_vs_modelB, []).extend(row.taus)

# Per comparison: mean tau, std of tau (np.std default) and number of pooled taus.
all_comparsions_summary = {}
for key, val in all_comparsions.items():
    all_comparsions_summary[key] = [np.mean(val), np.std(val), len(val)]

# Transpose so that each comparison label becomes a row (the index).
summary_df = pd.DataFrame(all_comparsions_summary).T
summary_df.columns = ["mean_tau", "std_tau", "n"]
summary_df["mean (std)"] = summary_df.apply(
    lambda summary_row: f"{summary_row.mean_tau:.2f} ({summary_row.std_tau:.2f})",
    axis=1,
)
# The comparison label exists only in the index, so it has to be written out;
# with index=False the exported rows carried no label at all. This also matches
# the other summary exports in this notebook, which keep the index.
summary_df.to_csv("tau_df_model_joined.csv", index=True)

In [65]:
# Display the scalar columns only; the list-valued "taus" / "p_values" columns are left out.
tau_df2[["method", "modelA_vs_modelB", "mean_tau", "std_tau", "mean (std)"]]

Unnamed: 0,method,modelA_vs_modelB,mean_tau,std_tau,mean (std)
0,all_text,BERT vs PetBERT,0.330901,0.440177,0.33 (0.44)
1,ensemble_25,BERT vs PetBERT,0.755866,0.137823,0.76 (0.14)
2,ensemble_50,BERT vs PetBERT,0.716576,0.192515,0.72 (0.19)
3,ensemble_75,BERT vs PetBERT,0.750693,0.20395,0.75 (0.20)
4,stack,BERT vs PetBERT,0.599408,0.206088,0.60 (0.21)


In [68]:
# Export the scalar columns of ``tau_df2`` together with the row index.
# NOTE(review): "tau_df_model_split.csv" is also written when ``tau_df2`` is
# built (all columns, index=False); this call overwrites that file - confirm
# that is intended.
tau_df2[['method', 'modelA_vs_modelB', 'mean_tau', 'std_tau',
         'mean (std)']].to_csv("tau_df_model_split.csv", index=True)

In [51]:
# Display the per-method input table (one row per method, with its list of arrays).
pre_tau_df2

Unnamed: 0,method,vals
0,all_text,"[[[1.1860912210411024e-05, 2.6704728177615042e..."
1,ensemble_25,"[[[0.03342106347588829, 0.0017214304963799124,..."
2,ensemble_50,"[[[0.022280708983925535, 0.001147620330919967,..."
3,ensemble_75,"[[[0.011140354491962812, 0.0005738101654599384..."
4,stack,"[[[0.01450382771362562, 0.0003132956704172804,..."


In [52]:
# Merge the per-method tau lists of ``tau_df2`` into one pooled list per
# model-pair label.
all_comparsions = {}
for comparison in tau_df2.itertuples():
    all_comparsions.setdefault(comparison.modelA_vs_modelB, []).extend(
        comparison.taus
    )

# For each label: [mean tau, std of tau, number of pooled taus].
all_comparsions_summary = {
    label: [np.mean(pooled), np.std(pooled), len(pooled)]
    for label, pooled in all_comparsions.items()
}

# Transpose so that each comparison label becomes a row (the index).
summary_df = pd.DataFrame(all_comparsions_summary).T
summary_df.columns = ["mean_tau", "std_tau", "n"]
# Human-readable "mean (std)" string with two decimals each.
summary_df["mean (std)"] = [
    f"{mean_tau:.2f} ({std_tau:.2f})"
    for mean_tau, std_tau in zip(summary_df["mean_tau"], summary_df["std_tau"])
]

In [53]:
# Display the current ``summary_df``.
# NOTE(review): the name is reassigned by several cells; this assumes it is the
# pooled model-pair summary built in the cell directly above.
summary_df

Unnamed: 0,mean_tau,std_tau,n,mean (std)
BERT vs PetBERT,0.630689,0.304032,5000.0,0.63 (0.30)
