In [8]:
from pprint import pprint

import numpy as np
import pandas as pd
import shap
import torch
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model
from torch import nn
from transformers import (
    AutoConfig,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    OPTForSequenceClassification,
    Pipeline,
)

import wandb

MODEL = "facebook/opt-350m"
MAX_POSITION_EMBEDDINGS = 2048

from dataclasses import dataclass

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

print(device)

cuda


In [10]:
CHECKPOINT_DIR = "OPT-350m-mimic-full"
VAL_DATASET_PATH = "data/val_9.csv"
CODE_PATH = "data/icd9_codes.csv"

In [11]:
# Load dataset
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True, device=device)

code_labels = pd.read_csv("data/icd9_codes.csv")
dataset = load_dataset("csv", data_files=VAL_DATASET_PATH)

# Create class dictionaries
classes = [class_ for class_ in code_labels["icd_code"] if class_]
class2id = {class_: id for id, class_ in enumerate(classes)}
id2class = {id: class_ for class_, id in class2id.items()}


def multi_labels_to_ids(labels: list[str]) -> list[float]:
    ids = [0.0] * len(class2id)  # BCELoss requires float as target type
    for label in labels:
        ids[class2id[label]] = 1.0
    return ids


def preprocess_function(example):
    result = tokenizer(
        example["text"], truncation=True, max_length=MAX_POSITION_EMBEDDINGS
    )
    result["labels"] = [multi_labels_to_ids(eval(label)) for label in example["labels"]]
    return result


dataset = dataset.map(
    preprocess_function, load_from_cache_file=True, batched=True, num_proc=8
)

In [12]:
config, unused_kwargs = AutoConfig.from_pretrained(
    MODEL,
    num_labels=len(classes),
    id2label=id2class,
    label2id=class2id,
    problem_type="multi_label_classification",
    return_unused_kwargs=True,
)

if unused_kwargs:
    print(f"Unused kwargs: {unused_kwargs}")

model = OPTForSequenceClassification.from_pretrained(
    MODEL,
    config=config,
).to(device)

model.load_adapter(CHECKPOINT_DIR)
model.to_bettertransformer()

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


OPTForSequenceClassification(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttentionLayerBetterTransformer(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): lora.Linear(
              (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.05, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=1024, out_features=16, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_fea

In [13]:
untokenized_dataset = load_dataset("csv", data_files=VAL_DATASET_PATH)

print(untokenized_dataset["train"][0])

{'text': "Sex:   M\n \nService: SURGERY\n \nAllergies: \nGrass ___, Standard / Lactose\n \n ___.\n \nChief Complaint:\nright popliteal aneurysm\n \nMajor Surgical or Invasive Procedure:\n___: popliteal artery stent graft\n\n \nHistory of Present Illness:\nMr. ___ has a fairly focal aneurysm in the\nmid right popliteal artery and is mostly full of thrombus and\nmeasures 3.1 cm.  It is patent and does have palpable pedal\npulse distally.  He has need of upcoming ankle surgery as well.\nHe has a past medical history notable for breast cancer status\npost mastectomy and chemotherapy/radiation therapy with duodenal\nulcer, pseudogout, depression, hypothyroidism, microvascular\ncerebrovascular disease, hyperlipidemia, and COPD.  He had vein\nmapping performed today which shows the lesser saphenous veins\nto be small and noncompressible bilaterally.  He has the\nthrombosis of the right greater saphenous at the level of the\nknee.  The left greater saphenous is adequate as are both\nbasilic an

In [14]:
inputs = tokenizer(
    untokenized_dataset["train"][0]["text"],
    return_tensors="pt",
    truncation=True,
    max_length=MAX_POSITION_EMBEDDINGS,
).to(device)

with torch.no_grad():
    logits = model(**inputs).logits

In [15]:
class OPT_ICD9_Pipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
        if "maybe_arg" in kwargs:
            preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, text):
        return self.tokenizer(
            text,
            truncation=True,
            max_length=MAX_POSITION_EMBEDDINGS,
            return_tensors="pt",
        )

    def _forward(self, model_inputs):
        outputs = self.model(**model_inputs)
        return outputs

    def postprocess(self, model_outputs):
        # logits = model_outputs.logits[0].numpy()
        #print(logits)
        probs = model_outputs["logits"].sigmoid()

        output = []
        for i, prob in enumerate(probs[0]):
            label = self.model.config.id2label[i]
            score = prob
            output.append({"label": label, "score": score})
        return output

In [16]:
pipeline = OPT_ICD9_Pipeline(model=model, tokenizer=tokenizer, device=device)

In [17]:
model.device

device(type='cuda', index=0)

In [18]:
pipeline(untokenized_dataset["train"][2]["text"])

[{'label': 'd-2449', 'score': tensor(0.0054)},
 {'label': 'd-25000', 'score': tensor(0.7648)},
 {'label': 'd-2720', 'score': tensor(0.0394)},
 {'label': 'd-2724', 'score': tensor(0.8761)},
 {'label': 'd-2749', 'score': tensor(0.0046)},
 {'label': 'd-2761', 'score': tensor(0.0081)},
 {'label': 'd-2762', 'score': tensor(0.0111)},
 {'label': 'd-27651', 'score': tensor(0.0074)},
 {'label': 'd-27800', 'score': tensor(0.0745)},
 {'label': 'd-2851', 'score': tensor(0.0076)},
 {'label': 'd-2859', 'score': tensor(0.0043)},
 {'label': 'd-2875', 'score': tensor(0.0122)},
 {'label': 'd-30000', 'score': tensor(0.0134)},
 {'label': 'd-30500', 'score': tensor(0.0022)},
 {'label': 'd-3051', 'score': tensor(0.0112)},
 {'label': 'd-311', 'score': tensor(0.0244)},
 {'label': 'd-32723', 'score': tensor(0.1292)},
 {'label': 'd-33829', 'score': tensor(0.0004)},
 {'label': 'd-3572', 'score': tensor(0.1000)},
 {'label': 'd-4019', 'score': tensor(0.9886)},
 {'label': 'd-40390', 'score': tensor(0.0132)},
 {'lab

In [19]:
masker = shap.maskers.Text(pipeline.tokenizer)

In [20]:
sample = shap.sample(untokenized_dataset["train"]["text"], 2)

In [21]:
explainer = shap.Explainer(pipeline, masker)

In [22]:
untokenized_dataset["train"][:2]["text"]

["Sex:   M\n \nService: SURGERY\n \nAllergies: \nGrass ___, Standard / Lactose\n \n ___.\n \nChief Complaint:\nright popliteal aneurysm\n \nMajor Surgical or Invasive Procedure:\n___: popliteal artery stent graft\n\n \nHistory of Present Illness:\nMr. ___ has a fairly focal aneurysm in the\nmid right popliteal artery and is mostly full of thrombus and\nmeasures 3.1 cm.  It is patent and does have palpable pedal\npulse distally.  He has need of upcoming ankle surgery as well.\nHe has a past medical history notable for breast cancer status\npost mastectomy and chemotherapy/radiation therapy with duodenal\nulcer, pseudogout, depression, hypothyroidism, microvascular\ncerebrovascular disease, hyperlipidemia, and COPD.  He had vein\nmapping performed today which shows the lesser saphenous veins\nto be small and noncompressible bilaterally.  He has the\nthrombosis of the right greater saphenous at the level of the\nknee.  The left greater saphenous is adequate as are both\nbasilic and cephal

In [23]:
shap_values = explainer(untokenized_dataset["train"][:5]["text"])

  0%|          | 0/498 [00:00<?, ?it/s]--- Logging error ---
Traceback (most recent call last):
  File "/h/u11/c0/00/ammcourt/miniconda3/envs/csc413/lib/python3.10/logging/__init__.py", line 1100, in emit
    msg = self.format(record)
  File "/h/u11/c0/00/ammcourt/miniconda3/envs/csc413/lib/python3.10/logging/__init__.py", line 943, in format
    return fmt.format(record)
  File "/h/u11/c0/00/ammcourt/miniconda3/envs/csc413/lib/python3.10/logging/__init__.py", line 678, in format
    record.message = record.getMessage()
  File "/h/u11/c0/00/ammcourt/miniconda3/envs/csc413/lib/python3.10/logging/__init__.py", line 368, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/h/u11/c0/00/ammcourt/miniconda3/envs/csc413/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/h/u11/c0/00/ammcourt/miniconda3/envs/csc413/lib/python3.10/runpy.py", line 86, in _run_cod

In [None]:
shap.plots.text(shap_values[0, :, "d-2749"])

In [None]:
out = shap.plots.text(shap_values[0, :, "d-2749"], display = False)
print(out)

<svg width="100%" height="80px"><line x1="0" y1="33" x2="100%" y2="33" style="stroke:rgb(150,150,150);stroke-width:1" /><line x1="51.69812792656875%" y1="33" x2="51.69812792656875%" y2="37" style="stroke:rgb(150,150,150);stroke-width:1" /><text x="51.69812792656875%" y="27" font-size="12px" fill="rgb(120,120,120)" dominant-baseline="bottom" text-anchor="middle">0.2</text><line x1="30.592195595379778%" y1="33" x2="30.592195595379778%" y2="37" style="stroke:rgb(150,150,150);stroke-width:1" /><text x="30.592195595379778%" y="27" font-size="12px" fill="rgb(120,120,120)" dominant-baseline="bottom" text-anchor="middle">0.1</text><line x1="9.486263264190795%" y1="33" x2="9.486263264190795%" y2="37" style="stroke:rgb(150,150,150);stroke-width:1" /><text x="9.486263264190795%" y="27" font-size="12px" fill="rgb(120,120,120)" dominant-baseline="bottom" text-anchor="middle">0</text><line x1="72.80406025775774%" y1="33" x2="72.80406025775774%" y2="37" style="stroke:rgb(150,150,150);stroke-width:1" 

In [50]:
# print(shap_values)

print(explainer.feature_names)
print(len(shap_values.values[1]))
# print(shap_values[0, :, "d-2749"].values)
print(len(shap_values[0, :, "d-2749"].values))
# print(len(shap_values.values[0]))
# print(len(shap_values.base_values[0]))
# print(len(shap_values.data[0]))

# print(shap_values.values[0])
# print(shap_values.base_values[0])
# print(shap_values.data[0])

max_indices = []
for val in shap_values.values[0]:
    print(val)
    inde = np.argmax(val)
    if inde not in max_indices:
        max_indices.append(inde)
    
print(max_indices)

# feature_names = untokenized_dataset["train"].columns
# rf_resultX = pd.DataFrame(shap_values, columns = feature_names)

# vals = np.abs(rf_resultX.values).mean(0)

# shap_importance = pd.DataFrame(list(zip(feature_names, vals)),
#                                   columns=['col_name','feature_importance_vals'])
# shap_importance.sort_values(by=['feature_importance_vals'],
#                                ascending=False, inplace=True)
# shap_importance.head()


# values, clustering = unpack_shap_explanation_contents(v)
#             tokens, values, group_sizes = process_shap_values(v.data, values, grouping_threshold, separator, clustering)


None
2918
2554
[ 2.71846031e-04 -6.47373496e-05 -6.96500435e-05  1.02145861e-04
 -9.35878713e-05 -1.72685996e-05 -4.42805739e-05 -1.46705371e-06
 -1.17421049e-04 -1.03596065e-05 -1.34308368e-04 -7.96414805e-05
  2.38606848e-05 -5.17637333e-05  1.76769763e-04  9.86517510e-04
 -6.10571735e-05  5.72479328e-05 -6.43825680e-06 -1.27825449e-04
 -1.19229087e-04 -4.87761334e-05 -1.50462321e-04 -3.87531047e-04
 -1.94815862e-04 -3.11754541e-04 -9.03045521e-06 -1.87728366e-04
 -3.48447924e-05  1.10823870e-03 -4.56700989e-06 -2.78246290e-05
 -1.66504713e-04  1.53701618e-05 -6.34699862e-05 -9.99308406e-05
 -7.70225056e-05  4.94692541e-05 -3.65434743e-05 -1.00511722e-04
 -1.52033790e-05 -1.24975048e-06  4.74635630e-05 -7.18403494e-05
 -5.02953969e-05 -7.98899391e-05 -7.49551366e-06 -6.30179933e-05
 -4.59664319e-05 -2.60730005e-06 -3.05512390e-05]
[ 2.71846031e-04 -6.47373496e-05 -6.96500435e-05  1.02145861e-04
 -9.35878713e-05 -1.72685996e-05 -4.42805739e-05 -1.46705371e-06
 -1.17421049e-04 -1.03596

Shap pipeline works in the following order:
- Generate the indices array (note this array corresponds to tokens, not strings)
- Tokenize the input dataset
- Pass the tokenized input dataset to the masking functions along with the indices
- The pass all tokenized and masked tokenized datasets to the faithfulness calcuation

This requires a prediction function that expects a tokenized input

In [86]:
def get_index_arrays_shap(inputs, pred_func, model, tokenizer, top_k = 5):
    """ Function to create the arrays corresponding to the shap 
    
    The output is in the format [[input_index_0, input_index_0, ... input_index_n, input_index_n], 
    [rationale_token_index_0 (for input 0), rationale_token_index_1 (for input 0), ... rationale_token_index_k-1 (for input n), rationale_token_index_k (for input n)]]. 
    This is used as an indexing array for masking.
    
    """
    
    # get the shap values over the inputs
    # shap_values = explainer(inputs, batch_size=5)
    
    # get the mode inferences for the inputs
    inferences = pred_func(inputs, model, tokenizer)
    indices_array = None
    # get the longest 
    
    for i, val in enumerate(shap_values):
        # get the choosen labels
        print("Inferences: ", inferences)
        choosen_labels = np.where(inferences[i] > 0.5)
        choosen_labels = np.unique(choosen_labels)
        
        # convert the indices to labels
        choosen_labels = np.array(choosen_labels).astype(int)
        print("Choosen label: ", choosen_labels)
        choosen_labels = [id2class[label] for label in choosen_labels]
        print(id2class)
        print(choosen_labels)
        
        # for each shap value, index in via it's choosen labels
        total_top_k_indices = np.array([])
        top_val = min(top_k, len(choosen_labels))
        print(top_val)
        print(choosen_labels)
        
        
        for label in choosen_labels:
            # get the top k shap value indices
            print(label)
            top_k_indices = np.argsort(shap_values[i, :, label].values)[-top_val:]
            print(top_k_indices)
            total_top_k_indices = np.append(total_top_k_indices, top_k_indices)
        
        # sort the indices array to be in ascending order
        total_top_k_indices = np.sort(total_top_k_indices)
        # remove duplicates
        total_top_k_indices = np.unique(total_top_k_indices)
        # this might be wrong, it seems like shap returns indices outside of the token range
        # so I'm not sure if shap is using the same tokenization function as ours.
        total_top_k_indices = total_top_k_indices[total_top_k_indices < 2048]
        
        # create a array of the same shape of total_top_k_indices and fill with value i
        index_array = np.full(total_top_k_indices.shape, i)
        
        if i == 0:
            indices_array = [index_array.tolist(), total_top_k_indices.tolist()]
        else:
            # append index array to indices array[0]
            indices_array[0] = indices_array[0] + index_array.tolist()
            # append total_top_k_indices to indices array[1]
            indices_array[1] = indices_array[1] + total_top_k_indices.tolist()
    
    return np.array(indices_array).astype(int)

In [93]:
def predictor_model_token(texts, model, tokenizer):
    # print(len(texts))
    tk = tokenizer(texts, return_tensors="pt",truncation=True, padding=True, max_length=MAX_POSITION_EMBEDDINGS).to(device)
    outputs = model(**tk)
    tensor_logits = outputs[0]
    probas = tensor_logits.sigmoid().detach().cpu().numpy()
    return probas

def predictor_model_no_token(texts, model, tokenizer):
    # print(len(texts))
    # tk = tokenizer(texts, return_tensors="pt",truncation=True, padding=True, max_length=MAX_POSITION_EMBEDDINGS).to(device)
    # tokenization is removed but still need to set texts to device
    # i'm not sure why this is a list and don't have time to debug
    texts = texts[0]
    texts.to(device)
    outputs = model(**texts)
    tensor_logits = outputs[0]
    probas = tensor_logits.sigmoid().detach().cpu().numpy()
    return probas

In [94]:
import faithfulness
# this reimports the library for easy testing in the notebook
import importlib
importlib.reload(faithfulness)

MAX_LEN=2048
   
# tokenize the test dataset
test_data = untokenized_dataset["train"][:5]["text"]

inputs = tokenizer(test_data, max_length=MAX_LEN, padding='max_length', truncation=True, return_tensors='pt')
print(inputs)

indices_array = get_index_arrays_shap(test_data, predictor_model_token, model, tokenizer)
print(indices_array)

inputs_rationale_removed = faithfulness.remove_rationale_words(inputs, indices_array, join=False, tokenized=True)
inputs_other_removed = faithfulness.remove_other_words(inputs, indices_array, join=False, tokenized=True)

ind, faith = faithfulness.calculate_faithfulness(inputs, [inputs_rationale_removed], [inputs_other_removed ], model, tokenizer, predictor_model_no_token)

{'input_ids': tensor([[    2, 35581,    35,  ...,  5788,     9,  2934],
        [    2, 35581,    35,  ...,  3082, 25495,    12],
        [    2, 35581,    35,  ..., 31946, 19188, 22870],
        [    2, 35581,    35,  ...,     5,    37,  9244],
        [    2, 35581,    35,  ...,   246,     4,  4573]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}
Inferences:  [[9.37975466e-01 5.25494153e-03 6.77522659e-01 4.98336613e-01
  6.79258537e-03 2.02010665e-02 3.65591724e-03 2.85266694e-02
  9.22198687e-03 1.40093759e-01 3.15162800e-02 2.56435410e-03
  2.00353339e-02 3.89346667e-03 1.25673488e-01 6.48736656e-01
  2.29764599e-02 6.66631246e-03 1.15575073e-02 8.88499469e-02
  1.89710241e-02 6.58680052e-02 5.89114707e-03 9.85623375e-02
  5.20462496e-03 9.10434872e-03 8.12007338e-02 2.80272570e-02
  1.78236177e-03 9.06653941e-01 3.23412903e-02 1.00508835

AttributeError: 'list' object has no attribute 'to'