# Evaluating Explainability

In [1]:
from utils import make_test_set_esnli, make_test_set_mnli, make_test_set_cose
from model import ZeroShotNLI, ZeroShotLearner
from explainer import Explainer

In [2]:
# Number of explanations; passed as `size` to every test-set builder below.
NUM_EXPL = 5

# Load the evaluation subsets.
# Full dev-set sizes for reference: e-SNLI 9842, MNLI 9815, CoS-E 1221.
dataset_esnli = make_test_set_esnli(size=NUM_EXPL)
dataset_mnli = make_test_set_mnli(size=NUM_EXPL)
dataset_cose = make_test_set_cose(size=NUM_EXPL)

Found cached dataset multi_nli (/Users/henningheyen/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)


  0%|          | 0/3 [00:00<?, ?it/s]

check3


Found cached dataset cos_e (/Users/henningheyen/.cache/huggingface/datasets/cos_e/v1.11/1.11.0/e8dc57a5b321a2a97063efb8d316d6d8a0d9a2d3a392dafc913e55bed42736d2)


  0%|          | 0/2 [00:00<?, ?it/s]

## Importing Models

In [3]:
# Natural Language Inference
# One ZeroShotNLI wrapper per 'nli-deberta-v3-<size>' checkpoint. The size list
# is the single source of truth: `models[i]` always corresponds to
# `model_names[i]`, and the per-size names are unpacked from the same list.
model_names = ['xsmall', 'small', 'base', 'large']

models = [
    ZeroShotNLI(model_name=f'nli-deberta-v3-{name}')
    for name in model_names
]

# Individual handles used by the per-model explanation cells further down.
xsmall, small, base, large = models

In [4]:
# Zero Shot Classification
# One ZeroShotLearner wrapper per 'cross-encoder/nli-deberta-v3-<size>'
# checkpoint. The size list is the single source of truth: `models[i]` always
# corresponds to `model_names[i]`.
#
# NOTE(review): this cell rebinds `xsmall`, `small`, `base`, `large`, `models`
# and `model_names`, which the Natural Language Inference cell above also
# defines. When both cells run in order, every later cell (including the
# task='NLI' explanation cells) sees these ZeroShotLearner instances, not the
# ZeroShotNLI ones. Confirm that this is intended, or run only the cell that
# matches the task being evaluated.
model_names = ['xsmall', 'small', 'base', 'large']

models = [
    ZeroShotLearner(model_name=f'cross-encoder/nli-deberta-v3-{name}')
    for name in model_names
]

# Individual handles used by the per-model explanation cells further down.
xsmall, small, base, large = models

# Faithfulness (Comprehensiveness and Sufficiency)

### Natural Language Inference

In [5]:
# Explainer set up with the three NLI class names.
# NOTE(review): the label order presumably has to match the order of the
# models' output scores -- confirm against the Explainer implementation.
explainer = Explainer(
    class_names=['contradiction', 'entailment', 'neutral'],
)

### MNLI

In [6]:
%%time

# Computing LIME explanations on xsmall
explanations_xsmall = explainer.compute_explanations(
    sentences = dataset_mnli['sentence_pairs'], 
    model=xsmall, 
    num_samples=100,  
    task='NLI',
    )

CPU times: user 3min 49s, sys: 3.68 s, total: 3min 53s
Wall time: 3min 52s


In [7]:
%%time

# Computing LIME explanations on small
explanations_small = explainer.compute_explanations(
   sentences = dataset_mnli['sentence_pairs'], 
   model=small, 
   num_samples=100,  
   task='NLI',
   )

CPU times: user 7min 54s, sys: 8.85 s, total: 8min 3s
Wall time: 8min 6s


In [8]:
%%time

# Computing LIME explanations on base
explanations_base = explainer.compute_explanations(
   sentences = dataset_mnli['sentence_pairs'], 
   model=base, 
   num_samples=100,  
   task='NLI',
   )

CPU times: user 16min 21s, sys: 25.7 s, total: 16min 47s
Wall time: 16min 57s


In [8]:
%%time

# Computing LIME explanations on large
explanations_large = explainer.compute_explanations(
   sentences = dataset_mnli['sentence_pairs'], 
   model=large, 
   num_samples=100,  
   task='NLI',
   )

UsageError: Line magic function `%%time` not found.


In [None]:
# MNLI explanations, one entry per model, ordered xsmall, small, base, large
# so that they can be indexed alongside `models` / `model_names`.
explanations_mnli = [explanations_xsmall, explanations_small, explanations_base, explanations_large]

In [None]:
# Aggregated comprehensiveness and sufficiency for every MNLI explanation.
# Result layout: comp_list[i][j] / suff_list[i][j] is the value returned by
# explainer.aggregated_metric for models[i] on the j-th MNLI sentence pair.
#
# The metrics are evaluated on the same sentence pairs the explanations were
# computed on (dataset_mnli['sentence_pairs']), and over all of them.
mnli_sentence_pairs = dataset_mnli['sentence_pairs']

comp_list = []
suff_list = []

for i, model in enumerate(models):

    print('model: ', model_names[i])

    # Comprehensiveness first, then sufficiency, for the current model.
    for metric, per_model_scores in (
        ('comprehensiveness', comp_list),
        ('sufficiency', suff_list),
    ):
        per_model_scores.append([
            explainer.aggregated_metric(
                metric=metric,
                explanation=explanations_mnli[i][j],
                sentence_pair=sentence_pair,
                predict=model.predict,
                verbose=False,
                bins=[0.1, 0.3, 0.5],
            )
            for j, sentence_pair in enumerate(mnli_sentence_pairs)
        ])


### e-SNLI

# Plausibility (IOU and Token Level F1 Scores)

## CoS-e

## e-SNLI