In [54]:
import shap
from datasets import load_dataset
from src.utils import MODEL_NAME_TO_DESC_DICT, format_text_pred, prepare_text
import matplotlib.pyplot as plt
import numpy as np
from transformers import AutoModelForSequenceClassification, pipeline, AutoTokenizer
import torch
import pandas as pd
from datasets import load_dataset, DatasetDict, Dataset
from transformers.pipelines.pt_utils import KeyDataset
from tqdm import tqdm

import lightgbm as lgb
from xgboost import XGBClassifier

In [10]:
# Column groups: the numeric metadata columns and the free-text column.
tab_cols = ['Year','Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)','Metascore', 'Rank']
text_col = ['Description']

# Test split, plus the two views derived from it: the text input built by
# prepare_text (helper from src.utils) and the tabular columns as a DataFrame.
test_df = load_dataset('james-burton/imdb_genre_prediction2', split='test')
test_df_text = prepare_text(test_df, 'text_col_only')
test_df_tab = test_df.to_pandas()[tab_cols]

# Train split as a DataFrame, its tabular columns, and the binary target.
train_df = load_dataset('james-burton/imdb_genre_prediction2', split='train').to_pandas()
train_df_tab = train_df[tab_cols]
y_train = train_df['Genre_is_Drama']


Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction2-a5449428d75bcc31/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--imdb_genre_prediction2-a5449428d75bcc31/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


## Text preds

In [41]:
# Fine-tuned two-class text classifier.
text_model = AutoModelForSequenceClassification.from_pretrained(
    "james-burton/imdb_genre_9", num_labels=2
)
# NOTE(review): the tokenizer comes from the base checkpoint rather than the
# fine-tuned repo -- presumably the fine-tuned model kept the stock vocabulary; confirm.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# Use the first GPU when one is visible, otherwise fall back to CPU so the
# notebook still runs on machines without CUDA.
text_device = "cuda:0" if torch.cuda.is_available() else "cpu"
text_pipeline = pipeline('text-classification', model=text_model, tokenizer=tokenizer, device=text_device)

def text_pred_fn(examples):
    """Score a collection of texts with the notebook's text pipeline.

    The texts are wrapped in a `Dataset` under the key 'text' and fed to
    `text_pipeline` in batches of 64; every raw pipeline output is then
    converted with `format_text_pred`.

    Args:
        examples: the texts to score (anything `Dataset.from_dict` accepts
            as a column).

    Returns:
        np.ndarray holding one formatted prediction per input text.
    """
    text_dataset = KeyDataset(Dataset.from_dict({'text': examples}), "text")
    raw_outputs = text_pipeline(text_dataset, batch_size=64)
    return np.array([format_text_pred(raw) for raw in raw_outputs])

# Build a SHAP explainer around the text prediction function; the tokenizer is
# passed as the masker argument.
text_explainer = shap.Explainer(text_pred_fn, tokenizer)
# Explain every test description.
text_shap_values = text_explainer(test_df_text[:], fixed_context=1, batch_size=64)
# Note: text base values are different for each prediction.

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Partition explainer: 201it [00:16,  4.81it/s]


In [38]:
# Mean predicted class scores over all test descriptions. The texts are scored
# in one batched call instead of one pipeline call per example, which is what
# triggered the "using the pipelines sequentially on GPU" warning.
text_pred_fn(list(test_df_text[:])).mean(axis=0)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


array([0.39784127, 0.60215873])

## Tab preds

In [102]:
# Tabular baseline: LightGBM classifier on the numeric columns, with a fixed
# seed. `fit` returns the fitted estimator, so construction and training are chained.
tab_model = lgb.LGBMClassifier(random_state=42).fit(train_df_tab, y_train)

def tab_pred_fn(examples):
    """Return `tab_model.predict_proba(examples)` -- the callable handed to SHAP."""
    return tab_model.predict_proba(examples)

# Model-agnostic explainer over the probability outputs, using the full
# training table as background data.
tab_explainer = shap.KernelExplainer(tab_pred_fn, train_df_tab)
# Only the first test row is explained here.
tab_shap_values = tab_explainer.shap_values(test_df_tab.iloc[:1])

Using 680 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
100%|██████████| 1/1 [00:00<00:00,  3.58it/s]


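We use the KernelExplainer here because the TreeExplainer explains the model's log-odds output by default, and I don't think those attributions can be converted to probabilities. (TODO: check whether `TreeExplainer(..., model_output="probability")` with a background dataset would work instead.)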

I can also confirm that the expected values are the mean predicted probabilities over the entire background dataset (the training rows passed to the KernelExplainer).

In [70]:
# Baseline (expected) model output reported by the tabular KernelExplainer, one value per class.
tab_explainer.expected_value

array([0.48825361, 0.51174639])

## Ensemble

In [15]:
class Model():
    """Weighted-average ensemble of the tabular and the text classifier.

    Every input row holds the tabular feature values followed by the
    description text in the last column. Tabular predictions come from the
    notebook-level `tab_model`; text predictions come from a lookup table
    and/or the notebook-level `text_pipeline` + `format_text_pred`.

    Args:
        text_to_pred_dict: optional mapping from description string to its
            already formatted text prediction, used as a lookup cache.
    """
    def __init__(self, text_to_pred_dict=None):
        self.text_to_pred_dict = text_to_pred_dict
        self.text_pred_len = 0  # not read anywhere in this class; kept for compatibility

    def _score_texts(self, descriptions):
        """Run the text pipeline on `descriptions` and format each raw output."""
        raw_preds = text_pipeline(list(descriptions))
        return [format_text_pred(pred) for pred in raw_preds]

    def predict_both(self, examples, text_weight=0.5, load_from_cache=True):
        """Blend text and tabular predictions for each row of `examples`.

        Args:
            examples: 2-D array; the last column is the description text and
                the remaining columns are passed to `tab_model.predict_proba`.
            text_weight: weight of the text prediction; the tabular
                prediction gets `1 - text_weight`.
            load_from_cache: when True, text predictions are looked up in
                `text_to_pred_dict` and only descriptions missing from it
                (or all of them, if no dict was given) are scored with the
                pipeline. When False, every distinct description is scored
                with the pipeline.

        Returns:
            Array with one row per example and two columns, the weighted
            average of the text and tabular predictions.
        """
        tab_examples = examples[:, :-1]
        tab_preds = tab_model.predict_proba(tab_examples)
        text_examples = examples[:, -1]

        # Group row indices by description so each distinct text is scored once.
        rows_by_desc = {}
        for row_idx, desc in enumerate(text_examples):
            rows_by_desc.setdefault(desc, []).append(row_idx)

        cache = self.text_to_pred_dict if load_from_cache else None
        if cache is None:
            cache = {}

        # Score cache misses on the fly instead of failing with KeyError/TypeError.
        # The caller's dict is left untouched.
        missing = [desc for desc in rows_by_desc if desc not in cache]
        fresh = dict(zip(missing, self._score_texts(missing))) if missing else {}

        expanded_text_preds = np.zeros((len(text_examples), 2))
        for desc, row_idxs in rows_by_desc.items():
            expanded_text_preds[row_idxs] = fresh[desc] if desc in fresh else cache[desc]

        # Weighted average: text_weight on the text model, the rest on the tabular model.
        preds = text_weight * expanded_text_preds + (1 - text_weight) * tab_preds
        return preds



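Speed things up by precomputing the text predictions for every train and test description, so the ensemble can look them up instead of re-running the text model.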

In [13]:
# Every train and test row, tabular columns followed by the description.
X_test_train = pd.concat([train_df[tab_cols + text_col], test_df.to_pandas()[tab_cols + text_col]])
all_descriptions = list(X_test_train['Description'])

# Reuse text_pred_fn so the texts go through the pipeline as a batched dataset
# (avoids the "using the pipelines sequentially on GPU" warning); it already
# returns the formatted predictions as an array.
text_preds = text_pred_fn(all_descriptions)

# Lookup table: description -> formatted text prediction.
text_to_pred_dict = dict(zip(all_descriptions, text_preds))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [16]:
# Ensemble backed by the precomputed text predictions.
ensemble_model = Model(text_to_pred_dict)


def ensemble_pred_fn(examples):
    """Prediction function handed to SHAP: the ensemble with text_weight=0.5."""
    return ensemble_model.predict_both(examples, text_weight=0.5)


# Background data: all training rows (tabular columns + description).
ensemble_explainer = shap.KernelExplainer(ensemble_pred_fn, train_df[tab_cols + text_col])

680it [00:00, 1977896.48it/s]
Using 680 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


In [107]:
# Inspect the text explanations (this displays the whole Explanation object).
text_shap_values

.values =
array([array([[ 0.02220371, -0.02220371],
              [-0.01506445,  0.01506445],
              [-0.03303704,  0.03303704],
              [-0.0311462 ,  0.0311462 ],
              [-0.02803044,  0.02803044],
              [ 0.05327172, -0.05327172],
              [-0.03996469,  0.03996469],
              [-0.00634225,  0.00634225],
              [-0.00302226,  0.00302226],
              [-0.03223676,  0.03223676],
              [-0.00229048,  0.00229048],
              [ 0.00648356, -0.00648356],
              [-0.00063192,  0.00063192],
              [ 0.00716726, -0.00716726],
              [ 0.00332985, -0.00332985],
              [-0.0017228 ,  0.0017228 ],
              [-0.00660889,  0.00660889],
              [-0.03208152,  0.03208152],
              [-0.0366188 ,  0.0366188 ],
              [-0.00219238,  0.00219238],
              [-0.00551338,  0.00551338],
              [-0.00705618,  0.00705618],
              [-0.03040338,  0.03040338],
              [-0.003842

In [32]:
# The original cell passed `x`, which is never defined in this notebook (left
# over from interactive kernel state), so it fails on a fresh run. Build the
# input explicitly: the first test row, tabular columns followed by the description.
# NOTE(review): the recorded output may have come from a different `x` -- confirm.
first_test_example = test_df.to_pandas()[tab_cols + text_col].iloc[:1].to_numpy()
ensemble_model.predict_both(first_test_example, text_weight=0.5)

1it [00:00, 14413.42it/s]


array([[0.10932201, 0.89067799]])

In [83]:
# Reorder the stacked SHAP values so the original leading axis becomes the last
# one, then take the first entry along the new leading axis.
np.transpose(np.array(tab_shap_values), (1, 2, 0))[0]

array([[ 0.02378674, -0.02378674],
       [ 0.07710546, -0.07710546],
       [-0.21282796,  0.21282796],
       [ 0.14288573, -0.14288573],
       [-0.09192256,  0.09192256],
       [-0.11714832,  0.11714832],
       [-0.08913887,  0.08913887]])

In [84]:
# Expected value of the tabular explainer, shown again for reference.
tab_explainer.expected_value

array([0.48825361, 0.51174639])

In [108]:
# Display labels for the tabular features of the first test row, headed by a
# ' Features' marker entry.
np.array([' Features', *(f' {col}: {val}' for col, val in zip(tab_cols, test_df_tab.iloc[0]))])

array(['\n', ' Year: 2009.0', ' Runtime (Minutes): 95.0', ' Rating: 7.7',
       ' Votes: 398972.0', ' Revenue (Millions): 32.39',
       ' Metascore: 76.0', ' Rank: 508.0'], dtype='<U26')

In [141]:
# Combine the text and tabular explanations of the first test example into one
# Explanation so both can be shown in a single text plot.
new_shap_val0 = text_shap_values[0]

# Tabular SHAP values for that row, rearranged the same way as the text values.
tab_vals0 = np.moveaxis(np.array(tab_shap_values), [0,1,2], [2,0,1])[0]
# Zero attribution for the separator entry inserted between text and features.
separator_vals = np.array([[0,0]])
tab_labels0 = [f' {col}: {val}' for col, val in zip(tab_cols, test_df_tab.iloc[0])]

# Both parts are scaled by 0.5 (equal weighting of text and tabular parts).
new_shap_val0.values = np.concatenate([0.5*new_shap_val0.values, separator_vals, 0.5*tab_vals0])
new_shap_val0.base_values = 0.5*new_shap_val0.base_values + 0.5*tab_explainer.expected_value
new_shap_val0.data = np.concatenate([new_shap_val0.data, np.array(['___________Features____________'] + tab_labels0)])
# Presumably cleared because the hierarchy no longer matches the extended entry list -- confirm.
new_shap_val0.hierarchical_values = None
shap.plots.text(new_shap_val0)

In [74]:
# Text plot for the first entry of text_shap_values.
shap.plots.text(text_shap_values[0])