In [1]:
import pickle
from src.dataset_info import get_dataset_info
from datasets import load_dataset
import numpy as np
from sklearn.linear_model import LinearRegression
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
from src.utils import prepare_text
from datasets import Dataset
from transformers import Trainer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
text_model_code = "disbert"
model_base = "distilbert-base-uncased"

test_set_size = 100


# Including the column name and column

Here we see that the baseline explanations, which here include the column name and colon, do better than the ones I have generated. I think this is because there are more features to work with because firstly, each tabular feature is treated like a word, which it is not, and secondly, the column name is included, which is not the case in my explanations.

## Tab Scale Factor: 1

## Predicting the ground truth for the test set

In [3]:
for ds_name in ["fake", "jigsaw", "kick", "imdb_genre"]:
    di = get_dataset_info(ds_name, model_type="all_text")

    test_df = load_dataset(
        di.ds_name,
        split="test",  # download_mode="force_redownload"
    ).to_pandas()
    test_df = test_df.sample(100, random_state=55)
    for filepath in [
        f"../models/shap_vals_{text_model_code}_sf1/{ds_name}/summed_shap_vals_all_text.pkl",
        f"../models/shap_vals_{text_model_code}_sf1/{ds_name}/summed_shap_vals_all_text_baseline.pkl",
    ]:
        with open(filepath, "rb") as f:
            grouped_shap_vals = pickle.load(f)

        reg = LinearRegression().fit(
            grouped_shap_vals[1], test_df[[di.label_col]])
        print(reg.score(grouped_shap_vals[1], test_df[[di.label_col]]))

for ds_name in ["wine", "prod_sent"]:
    di = get_dataset_info(ds_name, model_type="all_text")

    test_df = load_dataset(
        di.ds_name,
        split="test",  # download_mode="force_redownload"
    ).to_pandas()
    test_df = test_df.sample(100, random_state=55)
    for filepath in [
        f"../models/shap_vals_{text_model_code}_sf1/{ds_name}/summed_shap_vals_all_text.pkl",
        f"../models/shap_vals_{text_model_code}_sf1/{ds_name}/summed_shap_vals_all_text_baseline.pkl",
    ]:
        with open(filepath, "rb") as f:
            grouped_shap_vals = pickle.load(f)

        reg = LinearRegression().fit(
            grouped_shap_vals.reshape(grouped_shap_vals.shape[1], -1),
            test_df[[di.label_col]],
        )
        print(
            reg.score(
                grouped_shap_vals.reshape(grouped_shap_vals.shape[1], -1),
                test_df[[di.label_col]],
            )
        )

Using dataset fake, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_all_text-b16faba3acce3185/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


0.8531106514378886
0.8745459869124683
Using dataset jigsaw, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_all_text-351e9b1e029b8621/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


0.5939530422736701
0.7429309125367136
Using dataset kick, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_all_text-359fb6748cb2726a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


0.2235314271576363
0.28446298065749664
Using dataset imdb_genre, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_all_text-e7768922c61ebd55/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


0.26745295212849873
0.29149016668689953
Using dataset wine, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--wine_reviews_all_text-b647b2691a49354e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


1.0
1.0
Using dataset prod_sent, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_all_text-64fff8d5159768bc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


0.04068397999283868
0.06197952833804021


## Predict model predictions from explanations

In [4]:
pred_dict = {}

for ds_name in [
    "jigsaw",
    "kick",
    "wine",
    "imdb_genre",
    "prod_sent",
    "fake",
]:
    print(ds_name)
    di = get_dataset_info(ds_name, model_type="all_text")

    test_df = load_dataset(
        di.ds_name,
        split="test",  # download_mode="force_redownload"
    ).to_pandas()
    test_df = test_df.sample(100, random_state=55)[
        di.tab_cols + di.text_cols + [di.label_col]
    ]

    test_ds = Dataset.from_pandas(test_df)

    dataset = prepare_text(test_ds, "all_as_text", ds_name)

    # Load model and tokenizer
    model = AutoModelForSequenceClassification.from_pretrained(
        di.text_model_name, num_labels=di.num_labels, problem_type=di.prob_type
    )
    tokenizer = AutoTokenizer.from_pretrained(model_base)

    # Tokenize the dataset

    def encode(examples):
        return {
            "labels": np.array([examples[di.label_col]]),
            **tokenizer(examples["text"], truncation=True, padding="max_length"),
        }

    dataset = dataset.map(encode, load_from_cache_file=True)
    trainer = Trainer(model=model)
    preds = trainer.predict(dataset).predictions
    labels = trainer.predict(dataset).label_ids

    pred_dict[ds_name] = np.argmax(preds, axis=1).reshape(-1, 1)

jigsaw
Using dataset jigsaw, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_all_text-351e9b1e029b8621/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
                                                   

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: wow, other_religion, likes, homosexual_gay_or_lesbian, asian, comment_text, heterosexual, text, other_gender, other_race_or_ethnicity, atheist, __index_level_0__, sad, other_sexual_orientation, funny, psychiatric_or_mental_illness, buddhist, female, target, disagree, black, other_disability, physical_disability, muslim, male, jewish, bisexual, intellectual_or_learning_disability, hindu, latino, christian, transgender, white. If wow, other_religion, likes, homosexual_gay_or_lesbian, asian, comment_text, heterosexual, text, other_gender, other_race_or_ethnicity, atheist, __index_level_0__, sad, other_sexual_orientation, funny, psychiatric_or_mental_illness, buddhist, female, target, disagree, black, other_disability, physical_disability, muslim, male, jewish, bisexual, intellectual_or_learning_disability, hindu, latino, christian, transgender, 

kick
Using dataset kick, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_all_text-359fb6748cb2726a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
loading configuration file config.json from cache at /home/james/.cache/huggingface/hub/models--james-burton--kick_0/snapshots/56c93fe3c282bc41851064e7c2e796410e237414/config.json
Model config DistilBertConfig {
  "_name_or_path": "james-burton/kick_0",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transforme

wine
Using dataset wine, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--wine_reviews_all_text-b647b2691a49354e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
loading configuration file config.json from cache at /home/james/.cache/huggingface/hub/models--james-burton--wine_0/snapshots/63514546b7921ebe5c99f62a91dcaefec8b454d5/config.json
Model config DistilBertConfig {
  "_name_or_path": "james-burton/wine_0",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "1

imdb_genre
Using dataset imdb_genre, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_all_text-e7768922c61ebd55/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
loading configuration file config.json from cache at /home/james/.cache/huggingface/hub/models--james-burton--imdb_genre_0/snapshots/295d3fd4442d00c046709522b7afd86b694bcfd6/config.json
Model config DistilBertConfig {
  "_name_or_path": "james-burton/imdb_genre_0",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",


prod_sent
Using dataset prod_sent, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_all_text-64fff8d5159768bc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
loading configuration file config.json from cache at /home/james/.cache/huggingface/hub/models--james-burton--prod_sent_0/snapshots/791b25ae571148b0e1d16eab008f13d10caa91fc/config.json
Model config DistilBertConfig {
  "_name_or_path": "james-burton/prod_sent_0",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_

fake
Using dataset fake, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_all_text-b16faba3acce3185/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
loading configuration file config.json from cache at /home/james/.cache/huggingface/hub/models--james-burton--fake_0/snapshots/7bd58f15adbc883a1fe57687d5afc75d6802971d/config.json
Model config DistilBertConfig {
  "_name_or_path": "james-burton/fake_0",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers

In [5]:
names = ["Text and Tabular", "Baseline"]

for ds_name in ["fake", "jigsaw", "kick", "imdb_genre"]:  #
    di = get_dataset_info(ds_name, model_type="all_text")

    test_df = load_dataset(
        di.ds_name,
        split="test",  # download_mode="force_redownload"
    ).to_pandas()
    test_df = test_df.sample(100, random_state=55)[
        di.tab_cols + di.text_cols + [di.label_col]
    ]
    for filepath, name in zip(
        [
            f"../models/shap_vals_{text_model_code}_sf1/{ds_name}/summed_shap_vals_all_text.pkl",
            f"../models/shap_vals_{text_model_code}_sf1/{ds_name}/summed_shap_vals_all_text_baseline.pkl",
        ],
        names,
    ):
        with open(filepath, "rb") as f:
            grouped_shap_vals = pickle.load(f)

        reg = LinearRegression().fit(grouped_shap_vals[1], pred_dict[ds_name])
        print(name, " model_pred")
        print(reg.score(grouped_shap_vals[1], pred_dict[ds_name]))
    print("")
for ds_name in ["wine", "prod_sent"]:  #
    di = get_dataset_info(ds_name, model_type="all_text")

    test_df = load_dataset(
        di.ds_name,
        split="test",  # download_mode="force_redownload"
    ).to_pandas()
    test_df = test_df.sample(100, random_state=55)[
        di.tab_cols + di.text_cols + [di.label_col]
    ]
    for filepath, name in zip(
        [
            f"../models/shap_vals_{text_model_code}_sf1/{ds_name}/summed_shap_vals_all_text.pkl",
            f"../models/shap_vals_{text_model_code}_sf1/{ds_name}/summed_shap_vals_all_text_baseline.pkl",
        ],
        names,
    ):
        with open(filepath, "rb") as f:
            grouped_shap_vals = pickle.load(f)

        reg = LinearRegression().fit(
            grouped_shap_vals.reshape(grouped_shap_vals.shape[1], -1),
            pred_dict[ds_name],
        )
        print(name, " model_pred")
        print(
            reg.score(
                grouped_shap_vals.reshape(grouped_shap_vals.shape[1], -1),
                pred_dict[ds_name],
            )
        )
    print("")

    # print(reg.coef_)

    # print(reg.intercept_)


Using dataset fake, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--fake_job_postings2_all_text-b16faba3acce3185/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Text and Tabular  model_pred
0.9236128377480808
Baseline  model_pred
0.9382520652199182

Using dataset jigsaw, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--jigsaw_unintended_bias100K_all_text-351e9b1e029b8621/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Text and Tabular  model_pred
0.9814101700277078
Baseline  model_pred
0.9955902793929611

Using dataset kick, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--kick_starter_funding_all_text-359fb6748cb2726a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Text and Tabular  model_pred
0.672687736009064
Baseline  model_pred
0.6571649612910976

Using dataset imdb_genre, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction_all_text-e7768922c61ebd55/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Text and Tabular  model_pred
0.8201985436204173
Baseline  model_pred
0.850080007442346

Using dataset wine, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--wine_reviews_all_text-b647b2691a49354e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Text and Tabular  model_pred
1.0
Baseline  model_pred
1.0

Using dataset prod_sent, all as text version


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--product_sentiment_machine_hack_all_text-64fff8d5159768bc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Text and Tabular  model_pred
0.057229953181074156
Baseline  model_pred
0.08334524342976246



# Not summing

We can't not sum because each instance has a different number of shap values and therefore we cannot train a model to predict the ground truth from the explanations.