#### Implementation of Comprehensiveness

### Evaluating Faithfulness on our model:

In [2]:
import argparse
from pprint import pprint

import evaluate
import numpy as np
import pandas as pd
import torch
from torch import nn
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    EvalPrediction,
    OPTForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import wandb

# Hub id of the base checkpoint; used below for the tokenizer, the config and the model weights.
MODEL = "facebook/opt-350m"
# Passed as `max_length` whenever text is tokenized, so longer inputs are truncated to this many tokens.
MAX_POSITION_EMBEDDINGS = 2048

from dataclasses import dataclass


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Allow TF32 in both cuDNN and CUDA matmul kernels.
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

In [4]:
# Directory handed to `model.load_adapter` further down.
CHECKPOINT_DIR = "OPT-350m-mimic-full"
# CSV file loaded with `load_dataset("csv", ...)` as the evaluation data.
VAL_DATASET_PATH = "data/val_9.csv"
# Location of the CSV that lists the ICD-9 label codes.
CODE_PATH = "data/icd9_codes.csv"

In [5]:
# Load dataset
# NOTE(review): `device` is forwarded to the tokenizer as in the original cell; confirm it has any effect there.
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True, device=device)

# Read the code table through CODE_PATH so the path is defined in exactly one place.
code_labels = pd.read_csv(CODE_PATH)
dataset = load_dataset("csv", data_files=VAL_DATASET_PATH)

# Create class dictionaries
# Falsy entries in the `icd_code` column are dropped; the position of a code in `classes`
# is its class id.
classes = [class_ for class_ in code_labels["icd_code"] if class_]
class2id = {class_: index for index, class_ in enumerate(classes)}
id2class = {index: class_ for class_, index in class2id.items()}


def multi_labels_to_ids(labels: list[str]) -> list[float]:
    """Encode a list of class names as a multi-hot vector over `class2id`.

    Args:
        labels: Class names that apply to one example; each must be a key of `class2id`.

    Returns:
        A list with one float per entry of `class2id`: 1.0 at the positions of the
        given labels and 0.0 everywhere else.

    Raises:
        KeyError: If a label is not a key of `class2id`.
    """
    # Floats rather than ints: BCELoss requires float as target type.
    multi_hot = [0.0 for _ in class2id]
    for position in map(class2id.__getitem__, labels):
        multi_hot[position] = 1.0
    return multi_hot


def preprocess_function(example):
    """Tokenize a batch of examples and attach multi-hot label vectors.

    Args:
        example: Batch mapping with a "text" entry (the strings to tokenize) and a
            "labels" entry holding, per example, the string form of a Python list
            of class names (e.g. "['d-4019', 'd-2724']").

    Returns:
        The tokenizer output with an added "labels" entry that contains one
        multi-hot float list per example.

    Raises:
        ValueError, SyntaxError: If a "labels" string is not a plain Python literal.
        KeyError: If a parsed label is unknown to `multi_labels_to_ids`.
    """
    import ast  # function-scope import keeps this cell independent of the import cell

    result = tokenizer(
        example["text"], truncation=True, max_length=MAX_POSITION_EMBEDDINGS
    )
    # The label strings come from a CSV file. ast.literal_eval only accepts literals,
    # so unlike eval() it cannot execute code that happens to be embedded in the data.
    result["labels"] = [
        multi_labels_to_ids(ast.literal_eval(label)) for label in example["labels"]
    ]
    return result


# dataset = dataset.map(
#     preprocess_function, load_from_cache_file=True, batched=True, num_proc=8
# )

In [6]:
# Start from the base checkpoint's config and turn it into a multi-label classification
# config over our classes. return_unused_kwargs=True makes from_pretrained return a
# (config, leftover_kwargs) pair.
config, unused_kwargs = AutoConfig.from_pretrained(
    MODEL,
    return_unused_kwargs=True,
    problem_type="multi_label_classification",
    num_labels=len(classes),
    label2id=class2id,
    id2label=id2class,
)

# Surface any keyword argument the config did not consume.
if unused_kwargs:
    print(f"Unused kwargs: {unused_kwargs}")

# Instantiate the classifier with that config, then move it to the selected device.
model = OPTForSequenceClassification.from_pretrained(MODEL, config=config)
model = model.to(device)

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Load the adapter stored under CHECKPOINT_DIR into the base model created above
# (presumably the fine-tuned weights for this task — confirm against the training run).
model.load_adapter(CHECKPOINT_DIR)

In [None]:
# Load the validation CSV again without tokenizing it, so later cells can read the raw text.
untokenized_dataset = load_dataset("csv", data_files=VAL_DATASET_PATH)

# Print the first record of the "train" split as a quick sanity check.
print(untokenized_dataset['train'][0])

In [45]:
# Tokenize the first validation example as a one-item batch, truncated to the
# configured maximum length, and move the tensors to the selected device.
inputs = tokenizer(
    untokenized_dataset["train"][0]["text"],
    truncation=True,
    max_length=MAX_POSITION_EMBEDDINGS,
    return_tensors="pt",
)
inputs = inputs.to(device)

# Forward pass without gradient tracking; only the raw logits are kept.
with torch.no_grad():
    logits = model(**inputs).logits

In [None]:
logits = logits.to('cpu')

# Indices of every class whose sigmoid probability is above the 0.5 decision threshold.
predicted_class_ids = torch.arange(0, logits.shape[-1])[
    torch.sigmoid(logits).squeeze(dim=0) > 0.5
]

# Get the predicted class names
# The loop variable is not called `id`: at notebook scope that name would replace the
# builtin `id()` for every cell executed afterwards.
for class_id in predicted_class_ids:
    predicted_class = id2class[int(class_id)]
    pprint(code_labels[code_labels.icd_code == predicted_class])

In [None]:
# Print every class name next to its raw logit for the single example scored above.
for i, logit in enumerate(logits[0]):
    pprint(f'{classes[i]}, {logit}')

In [9]:
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from lime.lime_text import IndexedString
import numpy as np
import torch.nn.functional as F
from time import time


# LIME text explainer labelled with our class names. bow=False is passed so that, per the
# LIME documentation, explanations refer to word positions rather than to a bag of words.
explainer = LimeTextExplainer(class_names=classes, bow=False)

def predictor_opt(texts):
    """Score texts with the notebook's global `model` and `tokenizer`.

    This is the single-argument classifier callback handed to LIME; it simply
    forwards to `predictor_model` so both entry points share one implementation.

    Args:
        texts: A string or a list of strings to classify.

    Returns:
        NumPy array with the sigmoid of the model's logits for each text.
    """
    return predictor_model(texts, model, tokenizer)


# used by the faithfulness function
def predictor_model(texts, model, tokenizer):
    """Score texts with an explicitly supplied model and tokenizer.

    Args:
        texts: A string or a list of strings to classify.
        model: Callable model whose first output (`outputs[0]`) holds the logits.
        tokenizer: Tokenizer called with `return_tensors="pt"`; its result must
            support `.to(device)` and be unpackable into the model call.

    Returns:
        NumPy array (on the CPU) with the sigmoid of the model's logits for each text.
    """
    # padding=True is required for batches: texts that tokenize to different lengths
    # cannot be stacked into one tensor otherwise, and LIME always sends batches of
    # perturbed texts.
    tk = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_POSITION_EMBEDDINGS,
    ).to(device)
    # Inference only: without autograd bookkeeping the forward pass does not keep
    # activations alive, whether or not the caller remembered to wrap the call.
    with torch.no_grad():
        outputs = model(**tk)
        tensor_logits = outputs[0]
        probas = torch.sigmoid(tensor_logits)
    return probas.detach().cpu().numpy()

In [None]:
# Use the third validation example as the text to explain.
sentence = untokenized_dataset["train"][2]["text"]
# Number of perturbed samples LIME generates (passed as `num_samples` below).
n_samples = 10
# Number of top labels to explain (passed as `top_labels` below).
k = 5
# Character count and full text of the chosen example.
print(len(sentence))
print(sentence)

In [1]:
# Needs the import and setup cells above to have been run first (torch, explainer,
# sentence, predictor_opt, n_samples, k); on a fresh kernel this cell fails with a NameError.
# Release cached GPU memory, then run LIME on the chosen example without autograd tracking.
torch.cuda.empty_cache()
with torch.no_grad():
    exp_bert = explainer.explain_instance(
        sentence, predictor_opt, num_samples=n_samples, top_labels=k
    )

NameError: name 'torch' is not defined

In [None]:
# Render the LIME explanation inline; text=True asks LIME to include the explained text as well.
exp_bert.show_in_notebook(text=True)

In [17]:
# Ground-truth labels of the explained example; the CSV stores them as the string form of a list.
untokenized_dataset["train"][2]['labels']

"['d-42731', 'd-V5861', 'd-42789', 'd-25000', 'd-4019', 'd-2724', 'd-53081']"

In [11]:
import lime.explanation
import lime.lime_text
import numpy as np


def lime_create_index_arrays(instances, pred_fn, explainer, n_samples=10, k_labels=5):
    """Explain every instance with LIME and collect the rationale indices.

    Args:
        instances: Non-empty sequence of raw text strings.
        pred_fn: Classifier callback forwarded to `explainer.explain_instance`.
        explainer: Object exposing
            `explain_instance(text, pred_fn, num_samples=..., top_labels=...)`
            whose result offers `as_map()`.
        n_samples: Number of perturbed samples requested per explanation.
        k_labels: Number of top labels requested per explanation.

    Returns:
        A pair `(indexed_strs, index_array)`:
        * `indexed_strs` is a 2-D string array with one row per instance holding
          `IndexedString(instance).as_np`, right-padded with '' to a common width.
        * `index_array` is an integer array of shape (2, n): row 0 is the instance
          index and row 1 the feature index reported by the explanation, one column
          per (label, feature) entry of `as_map()`. It has shape (2, 0) when no
          explanation reported any feature.

    Raises:
        ValueError: If `instances` is empty.

    NOTE(review): the feature indices from `as_map()` are later used directly as
    positions in `as_np`; confirm that LIME's feature ids line up with those positions.
    """
    if len(instances) == 0:
        raise ValueError("instances must contain at least one text")

    # Tokenize everything up front so the padding width is the true maximum token
    # count; the longest string by characters need not have the most tokens.
    token_arrays = [lime.lime_text.IndexedString(text).as_np for text in instances]
    padding_len = max(len(tokens) for tokens in token_arrays)

    rationale_pairs = []
    padded_rows = []
    for i, instance in enumerate(instances):
        torch.cuda.empty_cache()
        with torch.no_grad():
            # Explain this instance itself so every row gets its own rationale.
            exp = explainer.explain_instance(
                instance, pred_fn, num_samples=n_samples, top_labels=k_labels
            )

        # as_map() maps each explained label to (feature index, weight) entries;
        # only the feature index is needed to build the mask later.
        for label_items in exp.as_map().values():
            rationale_pairs.extend((i, item[0]) for item in label_items)

        # Right-pad with empty strings so all rows can be stacked into one 2-D array.
        tokens = token_arrays[i]
        padding = np.full(padding_len - len(tokens), '', dtype=str)
        padded_rows.append(np.append(tokens, padding))

    indexed_strs = np.array(padded_rows)
    if rationale_pairs:
        # Transpose the (n, 2) pairs into [instance indices, feature indices].
        index_array = np.array(rationale_pairs).T
    else:
        index_array = np.empty((2, 0), dtype=int)

    return indexed_strs, index_array



The instances are returned as a 2-D array of strings with one row per sample, where each entry is one token produced by LIME's `IndexedString` (shorter rows are padded with empty strings). The rationale index array consists of two parallel lists: the first holds the index of the sample each rationale entry belongs to, and the second holds the index of the token used in that rationale.

In [27]:
# preprocessing code for comprehensiveness test
# generates the versions of the instances with rationale words removed, and the versions with all non rationale words removed
import copy
import numpy as np

def remove_rationale_words(instances, rationales):
    """Blank out the rationale tokens of every instance.

    Args:
        instances: 2-D array-like of token strings, one row per instance.
        rationales: Pair of equally long integer index sequences in the format
            `[[instance_index_1, ...], [word_index_1, ...]]`; entry j marks token
            `word_index_j` of instance `instance_index_j` as a rationale token.

    Returns:
        A new array with the shape of `instances` in which every rationale token is
        replaced by a single space. The input is left untouched.
    """
    instances = np.asarray(instances)

    # Boolean mask that is True exactly at the rationale positions, set in one
    # vectorized assignment instead of a Python loop.
    rationales_mask = np.zeros(instances.shape, dtype=bool)
    rationales_mask[rationales[0], rationales[1]] = True

    # np.where allocates a fresh array, so no defensive copy of the input is needed.
    return np.where(rationales_mask, " ", instances)
    
def remove_other_words(instances, rationales):
    """Blank out every token that is NOT part of a rationale.

    Args:
        instances: 2-D array-like of token strings, one row per instance.
        rationales: Pair of equally long integer index sequences in the format
            `[[instance_index_1, ...], [word_index_1, ...]]` naming the tokens to keep.

    Returns:
        A new array with the shape of `instances` in which every non-rationale token
        is replaced by a single space, so the number of tokens per row is unchanged.
        The input is left untouched.
    """
    instances = np.asarray(instances)

    # Start from "remove everything" and clear the flag at the rationale positions.
    inverse_rationales_mask = np.ones(instances.shape, dtype=bool)
    inverse_rationales_mask[rationales[0], rationales[1]] = False

    # np.where allocates a fresh array, so no defensive copy of the input is needed.
    return np.where(inverse_rationales_mask, " ", instances)


In [23]:
import numpy as np

def calculate_comprehensiveness(predictions, instances_rationale_removed, model, tokenizer):
    """Measure how far the predictions move once the rationale tokens are removed.

    Args:
        predictions: Array of scores produced for the unmodified instances.
        instances_rationale_removed: The instances with their rationale tokens
            removed, in a form `predictor_model` accepts.
        model: Model forwarded to `predictor_model`.
        tokenizer: Tokenizer forwarded to `predictor_model`.

    Returns:
        The mean, over instances, of the Euclidean distance between the original
        scores and the scores obtained without the rationale tokens.
    """
    print("Calculating Comprehensiveness")

    # Score the ablated texts with the same model.
    torch.cuda.empty_cache()
    ablated_predictions = predictor_model(instances_rationale_removed, model, tokenizer)
    print("Predictions ratonale removed: ", ablated_predictions)

    # Per-label shift caused by removing the rationale.
    delta = predictions - ablated_predictions
    print("Confidence Dif: ", delta)

    # Collapse the label axis into one Euclidean distance per instance.
    distance_per_instance = np.linalg.norm(delta, axis=-1)
    print("Confidence Dif - eudclidean distance: ", distance_per_instance)

    # Average the distances over the instances.
    return np.mean(distance_per_instance, axis=-1)
    


#### Implementation of Sufficiency

In [24]:
def calculate_sufficency(predictions, instances_other_removed, model, tokenizer):
    """Measure how far the predictions move when only the rationale tokens are kept.

    Args:
        predictions: Array of scores produced for the unmodified instances.
        instances_other_removed: The instances with every non-rationale token
            removed, in a form `predictor_model` accepts.
        model: Model forwarded to `predictor_model`.
        tokenizer: Tokenizer forwarded to `predictor_model`.

    Returns:
        The mean, over instances, of the Euclidean distance between the original
        scores and the scores obtained from the rationale tokens alone.
    """
    print("Calculating Sufficiency")

    # Score the rationale-only texts with the same model.
    torch.cuda.empty_cache()
    rationale_only_predictions = predictor_model(instances_other_removed, model, tokenizer)
    print("Predicitons other removed: ", rationale_only_predictions)

    # Per-label shift caused by dropping everything except the rationale.
    delta = predictions - rationale_only_predictions
    print("Confidence Dif: ", delta)

    # Collapse the label axis into one Euclidean distance per instance.
    distance_per_instance = np.linalg.norm(delta, axis=-1)
    print("Confidence Dif - eudclidean distance: ", distance_per_instance)

    # Average the distances over the instances.
    return np.mean(distance_per_instance, axis=-1)
    
    

#### Implementation of Faithfulness

In [25]:
def calculate_faithfulness(instances, instances_rationalle_removed, instances_other_removed, model, tokenizer):
    """Score the rationales of one or more interpretability methods.

    Args:
        instances: The unmodified texts, in a form `predictor_model` accepts.
        instances_rationalle_removed: One entry per interpretability method; each
            entry holds the texts with that method's rationale tokens removed.
        instances_other_removed: One entry per interpretability method, aligned with
            `instances_rationalle_removed`; each entry holds the texts with every
            non-rationale token removed.
        model: Model forwarded to the predictor and to the metric functions.
        tokenizer: Tokenizer forwarded to the predictor and to the metric functions.

    Returns:
        A pair `(best_index, scores)`: `scores` lists sufficiency * comprehensiveness
        for each method, and `best_index` is the position of the smallest score.
        NOTE(review): confirm that the smallest product is the intended notion of
        "best method".

    Raises:
        IndexError: If `instances_other_removed` has fewer entries than
            `instances_rationalle_removed`.
        ValueError: If no method is supplied (np.argmin of an empty list).
    """
    # generate predictions
    predictions = predictor_model(instances, model, tokenizer)
    faithfulness_calc = []

    # for each method, calculate the sufficency and comprehensiveness
    for i, rationale_removed in enumerate(instances_rationalle_removed):
        other_removed = instances_other_removed[i]
        print("Currently interpreting instance: ", i)

        print(rationale_removed)
        # Sufficiency asks whether the rationale alone reproduces the prediction,
        # so it must be fed the texts in which everything else was removed.
        sufficency = calculate_sufficency(predictions, other_removed, model, tokenizer)
        print("Sufficency for iteration: ", sufficency)

        # Comprehensiveness asks how much is lost without the rationale, so it
        # must be fed the texts in which the rationale was removed.
        comprehensiveness = calculate_comprehensiveness(predictions, rationale_removed, model, tokenizer)
        print("Comprehensiveness for iteration: ", comprehensiveness)

        # calculate faithfulness
        faithfulness = sufficency * comprehensiveness
        print("Faithfulness for iteration: ", faithfulness)
        faithfulness_calc.append(faithfulness)

    # return the minimum index of the faithfulness_calc to get the best method
    return np.argmin(faithfulness_calc), faithfulness_calc
        
    
    

### Test code for faithfulness calculation

In [None]:
# create a small test for the faithfulness evaluation: the slice
# samples_start:samples_end of the validation set is explained and scored
# from transformers import AutoTokenizer

# get the lime evaluations of each instance
from faithfulness import calculate_faithfulness, remove_other_words, remove_rationale_words

samples_start = 0
samples_end = 2

instances = untokenized_dataset["train"][samples_start:samples_end]["text"]
print(len(instances))

print(instances)
explainer = LimeTextExplainer(class_names=classes, bow=False)

indexed_text, index_array_rationalle = lime_create_index_arrays(instances, predictor_opt, explainer)
print(indexed_text)
print(index_array_rationalle)

# # remove the rationale words
rationalle_removed = remove_rationale_words(indexed_text, index_array_rationalle)
others_removed = remove_other_words(indexed_text, index_array_rationalle)

print(rationalle_removed)
print(others_removed)

# # concatenate the rationalle_removed and others_removed to tokenize them
# each row of tokens is joined back into one string per instance
rationalle_removed = [''.join(row.tolist()) for row in rationalle_removed]
others_removed = [''.join(row.tolist()) for row in others_removed]

print(rationalle_removed)

print(len(rationalle_removed))
print(len(others_removed))

# # the extra list is needed since the function expects a list of instances each coming from a different interpretability method
# All arguments are passed positionally: Python rejects a positional argument that
# follows a keyword argument, so `tokenizer=tokenizer, predictor_model` cannot be parsed.
# NOTE(review): presumably the `faithfulness` module imported in this cell accepts the
# predictor as a sixth argument; the in-notebook `calculate_faithfulness` takes only
# five parameters — confirm which version is intended.
ind, faith = calculate_faithfulness(instances, [rationalle_removed], [others_removed], model, tokenizer, predictor_model)
print(ind)
print(faith)
