In [38]:
import numpy as np
import shap
import pickle
from datasets import load_dataset

# from src.dataset_info import get_dataset_info
from transformers import pipeline, AutoTokenizer
import os
from tqdm import tqdm
from src.utils import token_segments, text_ft_index_ends, format_text_pred

# from src.models import Model
import xgboost as xgb
import lightgbm as lgb
from src.models import WeightedEnsemble, StackModel, AllAsTextModel
from src.joint_masker import JointMasker
from src.utils import ConfigLoader
import argparse
import scipy as sp


# Command-line interface: a single optional --config flag naming the
# configuration entry to run.
# NOTE(review): the help text mentions multi_config.yaml while run_shap reads
# ../configs/shap_configs.yaml — confirm which file name is current.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--config",
    type=str,
    default="vet_50b_all_text",
    help="Name of config from the multi_config.yaml file",
)


def run_shap(
    config_type,
    max_samples=100,
    test_set_size=100,
):
    """Build the configured model and compute SHAP values for sampled test rows.

    The configuration is read with ``ConfigLoader`` from
    ``../configs/shap_configs.yaml`` and ``../configs/dataset_default.yaml``.
    ``args.model_type`` selects the model that is explained:

    * ``"all_text"`` / ``"all_as_text"``: every column is rendered into one
      string and scored by the text pipeline (``AllAsTextModel``).
    * ``"ensemble_25"`` / ``"ensemble_50"`` / ``"ensemble_75"``: a LightGBM
      model on the tabular columns combined with the text pipeline in a
      ``WeightedEnsemble``; the numeric suffix divided by 100 is the text weight.
    * ``"stack"``: a second LightGBM model fitted on the validation split using
      the tabular features plus the tabular and text model predictions
      (``StackModel``).

    Args:
        config_type: Name of the configuration entry handed to ``ConfigLoader``.
        max_samples: Forwarded to ``JointMasker`` as ``max_samples``.
        test_set_size: Number of test rows to sample (``random_state=55``) and
            explain. Values larger than the test split are capped at its size.

    Returns:
        The object returned by calling the ``shap.explainers.Partition``
        explainer on the sampled test rows. Nothing is written to disk.

    Raises:
        NotImplementedError: For the ``*_reorder`` all-as-text variants.
        ValueError: If ``args.model_type`` is not one of the supported types.
    """
    # Function-local imports: a bare `import scipy` is not guaranteed to expose
    # its subpackages as attributes on every SciPy version.
    from scipy.cluster.hierarchy import complete
    from scipy.spatial.distance import pdist

    # Shap args
    args = ConfigLoader(
        config_type, "../configs/shap_configs.yaml", "../configs/dataset_default.yaml"
    )

    all_text_versions = [
        "all_text",
        "all_as_text",
        "all_as_text_base_reorder",
        "all_as_text_tnt_reorder",
    ]
    implemented_all_text_versions = ["all_as_text", "all_text"]
    ensemble_versions = ["ensemble_25", "ensemble_50", "ensemble_75"]

    # Fail fast on unsupported model types, before any dataset or model is
    # loaded and before the tabular model is trained.
    if args.model_type in all_text_versions:
        if args.model_type not in implemented_all_text_versions:
            # The reordered variants would need a cols_to_str_fn that follows
            # the column ordering stored in the dataset info.
            raise NotImplementedError(
                "Shouldn't need much as the column ordering is in dataset info, "
                "just need to update the cols_to_str_fn"
            )
    elif args.model_type not in ensemble_versions + ["stack"]:
        raise ValueError(f"Invalid model type of {args.model_type}")

    tab_cols = args.categorical_cols + args.numerical_cols
    all_cols = tab_cols + args.text_cols

    # Data
    ds_name = args.ds_name
    train_df = load_dataset(ds_name, split="train").to_pandas()
    y_train = train_df[args.label_col]

    test_df = load_dataset(ds_name, split="test").to_pandas()
    # DataFrame.sample raises when asked for more rows than exist (without
    # replacement), so cap the request at the size of the test split.
    if test_set_size is not None:
        test_set_size = min(test_set_size, len(test_df))
    test_df = test_df.sample(test_set_size, random_state=55)

    # Models: the text pipeline is configured identically for every model type.
    tokenizer = AutoTokenizer.from_pretrained(
        args.text_model_base, model_max_length=512
    )
    text_pipeline = pipeline(
        "text-classification",
        model=args.my_text_model,
        tokenizer=tokenizer,
        device="cuda:0",
        truncation=True,
        padding=True,
        top_k=None,
    )

    if args.model_type in all_text_versions:
        # Convert all columns (tabular first, then text) to a single string.
        def cols_to_str_fn(array):
            return " | ".join(f"{col}: {val}" for col, val in zip(all_cols, array))

        model = AllAsTextModel(
            text_pipeline=text_pipeline,
            cols_to_str_fn=cols_to_str_fn,
        )
    else:
        # Convert only the text columns to a single string; a lone text column
        # is passed through without a "col: " prefix.
        if len(args.text_cols) == 1:

            def cols_to_str_fn(array):
                return array[0]

        else:

            def cols_to_str_fn(array):
                return " | ".join(
                    f"{col}: {val}" for col, val in zip(args.text_cols, array)
                )

        # LightGBM requires explicitly marking categorical features
        train_df[args.categorical_cols] = train_df[args.categorical_cols].astype(
            "category"
        )
        test_df[args.categorical_cols] = test_df[args.categorical_cols].astype(
            "category"
        )

        tab_model = lgb.LGBMClassifier(random_state=42, max_depth=3)
        tab_model.fit(train_df[tab_cols], y_train)

        if args.model_type in ensemble_versions:
            # "ensemble_25" -> text weight 0.25, and so on.
            text_weight = float(args.model_type.split("_")[-1]) / 100
            model = WeightedEnsemble(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                text_weight=text_weight,
                cols_to_str_fn=cols_to_str_fn,
            )
        else:
            # model_type == "stack" (validated above).
            # For the stack model, we make predictions on the validation set.
            # These predictions are then used as features for the stack model
            # (another LightGBM model) along with the other tabular features.
            # In doing so the stack model learns, depending on the tabular
            # features, when to trust the tabular model and when to trust the
            # text model.
            val_df = load_dataset(ds_name, split="validation").to_pandas()
            val_df[args.categorical_cols] = val_df[args.categorical_cols].astype(
                "category"
            )
            y_val = val_df[args.label_col]
            val_text = list(map(cols_to_str_fn, val_df[args.text_cols].values))

            # Training set is the predictions from the tabular and text models
            # on the validation set plus the validation tabular features.
            text_val_preds = np.array(
                [format_text_pred(pred) for pred in text_pipeline(val_text)]
            )
            tab_val_preds = tab_model.predict_proba(val_df[tab_cols])

            # Copy before adding columns: assigning into a column slice of
            # val_df triggers pandas' SettingWithCopyWarning.
            stack_val_df = val_df[tab_cols].copy()
            stack_val_df["text_pred"] = text_val_preds[:, 1]
            stack_val_df["tab_pred"] = tab_val_preds[:, 1]

            stack_model = lgb.LGBMClassifier(
                random_state=42, max_depth=2, learning_rate=0.01
            )
            stack_model.fit(stack_val_df, y_val)

            model = StackModel(
                tab_model=tab_model,
                text_pipeline=text_pipeline,
                stack_model=stack_model,
                cols_to_str_fn=cols_to_str_fn,
            )

    np.random.seed(1)
    x = test_df[all_cols].values

    # Clustering only valid if there is more than one column
    if len(tab_cols) > 1:
        # We need to load the ordinal dataset so that we can calculate the
        # correlations for the masker. Missing values are filled with the
        # column median, and columns are compared by correlation distance.
        ord_train_df = load_dataset(args.ord_ds_name, split="train").to_pandas()
        ord_tab_df = ord_train_df[tab_cols]
        tab_pt = complete(
            pdist(
                ord_tab_df.fillna(ord_tab_df.median()).values.T,
                metric="correlation",
            )
        )
    else:
        tab_pt = None

    masker = JointMasker(
        tab_df=train_df[tab_cols],
        text_cols=args.text_cols,
        cols_to_str_fn=cols_to_str_fn,
        tokenizer=tokenizer,
        collapse_mask_token=True,
        max_samples=max_samples,
        tab_partition_tree=tab_pt,
    )

    explainer = shap.explainers.Partition(model=model.predict, masker=masker)
    shap_vals = explainer(x)

    # The values are only returned; persisting them (e.g. pickling to a
    # results directory) is left to the caller.
    return shap_vals

In [39]:
config_type = "vet_50c_all_text"

# Keep the result of this long-running call in a variable: run_shap only
# returns the SHAP values and does not write them anywhere, so a bare call
# would leave them reachable only through the cell's output history.
shap_vals_sample = run_shap(config_type, test_set_size=1000)
# gen_summary_shap_vals(config_type)

Updating with:
{'config': 'vet_50c_all_text', 'my_text_model': 'james-burton/vet_50c', 'ds_name': 'james-burton/vet_month_1c_all_text', 'text_model_base': 'SAVSNET/PetBERT_pretrained', 'model_type': 'all_text', 'ord_ds_name': 'james-burton/vet_month_1c_ordinal', 'text_cols': ['breed', 'region', 'record']}


{'categorical_cols': ['gender', 'neutered', 'species', 'insured'], 'numerical_cols_long': ['age_at_consult', 'Diseases of the ear or mastoid process', 'Mental, behavioural or neurodevelopmental disorders', 'Diseases of the blood or blood-forming organs', 'Diseases of the circulatory system', 'Dental', 'Developmental anomalies', 'Diseases of the digestive system', 'Endocrine, nutritional or metabolic diseases', 'Diseases of the Immune system', 'Certain infectious or parasitic diseases', 'Diseases of the skin', 'Diseases of the musculoskeletal system or connective tissue', 'Neoplasms', 'Diseases of the nervous system', 'Diseases of the visual system', 'Certain conditions originating i

  0%|          | 0/498 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Single example, all text

In [40]:
# Build the all-as-text model, the joint masker and the SHAP explainer for the
# configuration named by `config_type` (set in an earlier cell).
args = ConfigLoader(
    config_type, "../configs/shap_configs.yaml", "../configs/dataset_default.yaml"
)
ds_name = args.ds_name

# Train and test splits; the full test split is kept (no subsampling here).
train_df = load_dataset(ds_name, split="train").to_pandas()
y_train = train_df[args.label_col]
test_df = load_dataset(ds_name, split="test").to_pandas()

# Models
tokenizer = AutoTokenizer.from_pretrained(args.text_model_base, model_max_length=512)
text_pipeline = pipeline(
    "text-classification",
    model=args.my_text_model,
    tokenizer=tokenizer,
    device="cuda:0",
    truncation=True,
    padding=True,
    top_k=None,
)


def cols_to_str_fn(array):
    """Render one row as "col: val | col: val | ..." over all feature columns."""
    col_names = args.categorical_cols + args.numerical_cols + args.text_cols
    return " | ".join(f"{col}: {val}" for col, val in zip(col_names, array))


model = AllAsTextModel(text_pipeline=text_pipeline, cols_to_str_fn=cols_to_str_fn)

# The ordinal version of the dataset is used to compute the correlations
# between tabular columns for the masker.
ord_train_df = load_dataset(args.ord_ds_name, split="train").to_pandas()

# Clustering is only valid with more than one tabular column; otherwise the
# masker receives no partition tree.
tab_cols = args.categorical_cols + args.numerical_cols
tab_pt = None
if len(tab_cols) > 1:
    ord_tab_df = ord_train_df[tab_cols]
    col_distances = sp.spatial.distance.pdist(
        ord_tab_df.fillna(ord_tab_df.median()).values.T,
        metric="correlation",
    )
    tab_pt = sp.cluster.hierarchy.complete(col_distances)

masker = JointMasker(
    tab_df=train_df[tab_cols],
    text_cols=args.text_cols,
    cols_to_str_fn=cols_to_str_fn,
    tokenizer=tokenizer,
    collapse_mask_token=True,
    max_samples=100,
    tab_partition_tree=tab_pt,
)

explainer = shap.explainers.Partition(model=model.predict, masker=masker)

Updating with:
{'config': 'vet_50c_all_text', 'my_text_model': 'james-burton/vet_50c', 'ds_name': 'james-burton/vet_month_1c_all_text', 'text_model_base': 'SAVSNET/PetBERT_pretrained', 'model_type': 'all_text', 'ord_ds_name': 'james-burton/vet_month_1c_ordinal', 'text_cols': ['breed', 'region', 'record']}


{'categorical_cols': ['gender', 'neutered', 'species', 'insured'], 'numerical_cols_long': ['age_at_consult', 'Diseases of the ear or mastoid process', 'Mental, behavioural or neurodevelopmental disorders', 'Diseases of the blood or blood-forming organs', 'Diseases of the circulatory system', 'Dental', 'Developmental anomalies', 'Diseases of the digestive system', 'Endocrine, nutritional or metabolic diseases', 'Diseases of the Immune system', 'Certain infectious or parasitic diseases', 'Diseases of the skin', 'Diseases of the musculoskeletal system or connective tissue', 'Neoplasms', 'Diseases of the nervous system', 'Diseases of the visual system', 'Certain conditions originating i

In [43]:
# Explain only the first test row with the explainer built above; the global
# NumPy seed is fixed first.
np.random.seed(1)
first_test_row = test_df[
    args.categorical_cols + args.numerical_cols + args.text_cols
].iloc[:1]
x = first_test_row.values
shap_vals = explainer(x)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


  0%|          | 0/498 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be usin

In [42]:
# Show the shape of the feature matrix `x`.
# NOTE(review): the recorded output (2184, 28) looks stale — this cell is
# In [42], while In [43] slices `x` down to its first row with `[:1]`;
# re-run to confirm.
x.shape

(2184, 28)

In [44]:
# Display the SHAP result computed in In [43] for the first test row
# (explainer built from the `vet_50c_all_text` config).
shap_vals

.values =
array([[[ 2.43847685e-03, -2.43847750e-03],
        [ 2.34704246e-03, -2.34704753e-03],
        [ 1.41339569e-05, -1.41350232e-05],
        [ 1.41339569e-05, -1.41350232e-05],
        [ 1.93553087e-02, -1.93553121e-02],
        [ 1.41339569e-05, -1.41350232e-05],
        [ 1.41339569e-05, -1.41350232e-05],
        [ 1.03923977e-04, -1.03925954e-04],
        [ 1.03923977e-04, -1.03925954e-04],
        [ 1.03923977e-04, -1.03925954e-04],
        [ 1.41339569e-05, -1.41350232e-05],
        [ 1.03923977e-04, -1.03925954e-04],
        [ 1.03923977e-04, -1.03925954e-04],
        [ 1.41339569e-05, -1.41350232e-05],
        [ 1.41339569e-05, -1.41350232e-05],
        [ 1.41339569e-05, -1.41350232e-05],
        [ 1.41339569e-05, -1.41350232e-05],
        [ 1.41339569e-05, -1.41350232e-05],
        [ 1.41339569e-05, -1.41350232e-05],
        [ 1.41339569e-05, -1.41350232e-05],
        [ 1.41339569e-05, -1.41350232e-05],
        [ 1.41339569e-05, -1.41350232e-05],
        [ 1.41339569e-

In [45]:
# Rebuild the all-as-text model, masker and explainer for a second config.
# This rebinds `args`, `tokenizer`, `model`, `masker` and `explainer` from the
# earlier setup cell.
config_type = "vet_10c_all_text"
args = ConfigLoader(
    config_type, "../configs/shap_configs.yaml", "../configs/dataset_default.yaml"
)
ds_name = args.ds_name

# Train and test splits; the full test split is kept (no subsampling here).
train_df = load_dataset(ds_name, split="train").to_pandas()
y_train = train_df[args.label_col]
test_df = load_dataset(ds_name, split="test").to_pandas()

# Models
tokenizer = AutoTokenizer.from_pretrained(args.text_model_base, model_max_length=512)
text_pipeline = pipeline(
    "text-classification",
    model=args.my_text_model,
    tokenizer=tokenizer,
    device="cuda:0",
    truncation=True,
    padding=True,
    top_k=None,
)


def cols_to_str_fn(array):
    """Join every column of one row into a single "col: val | ..." string."""
    col_names = args.categorical_cols + args.numerical_cols + args.text_cols
    pieces = [f"{col}: {val}" for col, val in zip(col_names, array)]
    return " | ".join(pieces)


model = AllAsTextModel(text_pipeline=text_pipeline, cols_to_str_fn=cols_to_str_fn)

# The ordinal version of the dataset is used to compute the correlations
# between tabular columns for the masker.
ord_train_df = load_dataset(args.ord_ds_name, split="train").to_pandas()

# Clustering is only valid with more than one tabular column; otherwise the
# masker receives no partition tree.
tab_cols = args.categorical_cols + args.numerical_cols
if len(tab_cols) <= 1:
    tab_pt = None
else:
    ord_tab_df = ord_train_df[tab_cols]
    ord_tab_filled = ord_tab_df.fillna(ord_tab_df.median())
    tab_pt = sp.cluster.hierarchy.complete(
        sp.spatial.distance.pdist(ord_tab_filled.values.T, metric="correlation")
    )

masker = JointMasker(
    tab_df=train_df[tab_cols],
    text_cols=args.text_cols,
    cols_to_str_fn=cols_to_str_fn,
    tokenizer=tokenizer,
    collapse_mask_token=True,
    max_samples=100,
    tab_partition_tree=tab_pt,
)

explainer = shap.explainers.Partition(model=model.predict, masker=masker)

Updating with:
{'config': 'vet_10c_all_text', 'my_text_model': 'james-burton/vet_10c', 'ds_name': 'james-burton/vet_month_1c_all_text', 'text_model_base': 'bert-base-uncased', 'model_type': 'all_text', 'ord_ds_name': 'james-burton/vet_month_1c_ordinal', 'text_cols': ['breed', 'region', 'record']}


{'categorical_cols': ['gender', 'neutered', 'species', 'insured'], 'numerical_cols_long': ['age_at_consult', 'Diseases of the ear or mastoid process', 'Mental, behavioural or neurodevelopmental disorders', 'Diseases of the blood or blood-forming organs', 'Diseases of the circulatory system', 'Dental', 'Developmental anomalies', 'Diseases of the digestive system', 'Endocrine, nutritional or metabolic diseases', 'Diseases of the Immune system', 'Certain infectious or parasitic diseases', 'Diseases of the skin', 'Diseases of the musculoskeletal system or connective tissue', 'Neoplasms', 'Diseases of the nervous system', 'Diseases of the visual system', 'Certain conditions originating in the per

In [46]:
# Explain the first test row again, this time with the explainer rebuilt for
# the `vet_10c_all_text` config; the global NumPy seed is fixed first.
np.random.seed(1)
first_test_row2 = test_df[
    args.categorical_cols + args.numerical_cols + args.text_cols
].iloc[:1]
x2 = first_test_row2.values
shap_vals2 = explainer(x2)

  0%|          | 0/498 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be usin

In [47]:
# Display the SHAP result computed in In [46] for the first test row
# (explainer built from the `vet_10c_all_text` config).
shap_vals2

.values =
array([[[ 2.61602763e-05, -2.61596560e-05],
        [ 4.66747676e-05, -4.66751065e-05],
        [ 2.61602763e-05, -2.61596560e-05],
        [ 2.61602763e-05, -2.61596560e-05],
        [ 2.61602763e-05, -2.61596560e-05],
        [ 2.61602763e-05, -2.61596560e-05],
        [ 2.61602763e-05, -2.61596560e-05],
        [ 4.66747676e-05, -4.66751065e-05],
        [ 4.66747676e-05, -4.66751065e-05],
        [ 4.66747676e-05, -4.66751065e-05],
        [ 2.61602763e-05, -2.61596560e-05],
        [ 4.66747676e-05, -4.66751065e-05],
        [ 4.66747676e-05, -4.66751065e-05],
        [ 2.61602763e-05, -2.61596560e-05],
        [ 2.61602763e-05, -2.61596560e-05],
        [ 2.61602763e-05, -2.61596560e-05],
        [ 2.61602763e-05, -2.61596560e-05],
        [ 2.61602763e-05, -2.61596560e-05],
        [ 2.61602763e-05, -2.61596560e-05],
        [ 2.61602763e-05, -2.61596560e-05],
        [ 2.61602763e-05, -2.61596560e-05],
        [ 2.61602763e-05, -2.61596560e-05],
        [ 2.61602763e-

In [48]:
import pickle
from transformers import AutoTokenizer
import numpy as np
import shap
from src.plot_text import text, get_grouped_vals

# from src.utils import format_fts_for_plotting, text_ft_index_ends, token_segments
# from src.utils import legacy_get_dataset_info
from datasets import load_dataset

# import matplotlib.pyplot as plt
from src.run_shap import load_shap_vals
from tqdm import tqdm

# import re
# import seaborn as sns
# from scipy import stats
from src.utils import (
    ConfigLoader,
    text_ft_index_ends,
    token_segments,
    format_fts_for_plotting,
    format_text_fts_too,
)

In [50]:
# Render a SHAP text plot for one explained row, with the tabular features
# and the text columns formatted for display.
idx = 0
sv = shap_vals2

tokenizer = AutoTokenizer.from_pretrained(args.text_model_base)

# The tabular features come first in each row; everything after them belongs
# to the text columns.
n_tab_fts = len(args.categorical_cols + args.numerical_cols)
text_idxs = text_ft_index_ends(
    text_fts=sv.data[idx][n_tab_fts:],
    tokenizer=tokenizer,
)

# Line breaks go before the first text feature and after each index returned
# by text_ft_index_ends, shifted past the tabular features.
# NOTE(review): presumably these are the end positions of each text column's
# tokens — confirm against src.utils.text_ft_index_ends.
linebreak_before_idxs = [n_tab_fts]
linebreak_before_idxs.extend(end_idx + n_tab_fts + 1 for end_idx in text_idxs)

row_sv = sv[idx]
tab_formatted = np.array(
    format_fts_for_plotting(row_sv.feature_names, row_sv.data[:n_tab_fts])
)
formatted_data = format_text_fts_too(
    tab_formatted,
    linebreak_before_idxs,
    args.text_cols,
)

row_explanation = shap.Explanation(
    values=row_sv.values,
    base_values=row_sv.base_values,
    data=formatted_data,
    clustering=row_sv.clustering,
    output_names=args.label_names,
    hierarchical_values=row_sv.hierarchical_values,
)
shap.text_plot(row_explanation, grouping_threshold=0.1)