<a href="https://colab.research.google.com/github/henningheyen/TransformersExplainability/blob/main/notebooks/Colab_Explainability_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Global Explainability Evaluation

Structure:
- Computing explanations for all models and datasets
- Evaluating global faithfulness (comprehensiveness and sufficiency)
- Evaluating global plausibility (IOU and token-level F1 scores)

## Cloning the TransformersExplainability Repo

In [None]:
# Fetch the project repository; it presumably provides the local `utils`, `model` and `explainer` modules imported in the Setup section.
!git clone https://github.com/henningheyen/TransformersExplainability.git

Cloning into 'TransformersExplainability'...
remote: Enumerating objects: 235, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 235 (delta 2), reused 6 (delta 1), pack-reused 223[K
Receiving objects: 100% (235/235), 49.68 MiB | 19.98 MiB/s, done.
Resolving deltas: 100% (143/143), done.


In [None]:
# Switch into the cloned repository so that later cells run from its root directory.
%cd TransformersExplainability

/content/TransformersExplainability


In [None]:
# Installing Dependencies
!pip install lime
!pip install transformers
!pip install sentencepiece
!pip install datasets

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=d3532b841193ed1b7e11ba0a71ca1d3e245d6f1b2db4c493930fbb2367fb84a5
  Stored in directory: /root/.cache/pip/wheels/fd/a2/af/9ac0a1a85a27f314a06b39e1f492bee1547d52549a4606ed89
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1
Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-no

## Setup

In [None]:
from utils import make_test_set_esnli, make_test_set_mnli, make_test_set_cose
from model import ZeroShotNLI, ZeroShotLearner
from explainer import Explainer
import numpy as np


In [None]:
# Number of instances per dataset for which explanations are computed and evaluated
NUM_EXPL = 100

# Load NUM_EXPL-sized test sets for each dataset
dataset_esnli = make_test_set_esnli(size=NUM_EXPL, remove_bad_explanations=True) # the full e-SNLI dev set has 9830 usable instances (9842 originally, 12 of which are wrongly annotated)
dataset_mnli = make_test_set_mnli(size=NUM_EXPL) # the full MNLI dev set has 9815 instances
dataset_cose = make_test_set_cose(size=NUM_EXPL, remove_bad_explanations=True) # the full CoS-e dev set has 718 usable instances (1221 originally, 503 of which have bad explanations, i.e. the whole question is highlighted)

In [None]:
# Average share of the input that the human annotators highlighted, per dataset.
# The two averages are passed as `top_percent` in the plausibility section further down.

# e-SNLI dev set: 9842 instances originally, 12 of them wrongly annotated -> 9830 usable
full_dataset_esnli = make_test_set_esnli(size=9830, remove_bad_explanations=True)
# CoS-e dev set: 1221 instances originally, 503 of them highlight the whole question -> 718 usable
full_dataset_cose = make_test_set_cose(size=718, remove_bad_explanations=True)

# Iterate over the columns that were actually returned instead of over hard-coded sizes;
# each column is read once rather than once per instance.
# Ratio = number of highlighted tokens / number of whitespace-separated input tokens.
len_esnli = [
    len(explanation) / (len(pair[0].split()) + len(pair[1].split()))
    for explanation, pair in zip(full_dataset_esnli['extractive_explanation'], full_dataset_esnli['sentence_pairs'])
]
len_cose = [
    len(explanation) / len(question.split())
    for explanation, question in zip(full_dataset_cose['extractive_explanation'], full_dataset_cose['question'])
]

avg_len_esnli = np.mean(len_esnli)
avg_len_cose = np.mean(len_cose)

print('average explanation input ratio e-SNLI: ', avg_len_esnli)
print('average explanation input ratio CoS-e: ', avg_len_cose)

## Calculating Explanations

### Natural Language Inference

In [None]:
# Natural Language Inference: one ZeroShotNLI wrapper per DeBERTa-v3 checkpoint size
xsmall_nli, small_nli, base_nli, large_nli = [
    ZeroShotNLI(model_name=checkpoint)
    for checkpoint in (
        'nli-deberta-v3-xsmall',
        'nli-deberta-v3-small',
        'nli-deberta-v3-base',
        'nli-deberta-v3-large',
    )
]

# Parallel lists: position k in `model_names_nli` labels position k in `models_nli`
models_nli = [xsmall_nli, small_nli, base_nli, large_nli]
model_names_nli = ['xsmall_nli', 'small_nli', 'base_nli', 'large_nli']


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/568M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/738M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/417 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

#### MNLI

In [None]:
# Explainer used for all MNLI explanations and metrics below.
# NOTE(review): the class order presumably has to match the label order of the NLI models — confirm in `Explainer`.
explainer_mnli = Explainer(class_names=['contradiction', 'entailment', 'neutral'])

In [None]:
%%time

# Computing LIME explanations for the MNLI sentence pairs on the xsmall NLI model.
# num_samples is presumably the number of perturbed samples per instance — confirm in `Explainer.compute_explanations`.
explanations_xsmall_mnli = explainer_mnli.compute_explanations(
    sentences = dataset_mnli['sentence_pairs'],
    model=xsmall_nli,
    num_samples=100,
    task='NLI',
    )

CPU times: user 6min 28s, sys: 1min 38s, total: 8min 7s
Wall time: 2min 3s


2min 3s

In [None]:
%%time

# Computing LIME explanations for the MNLI sentence pairs on the small NLI model.
# num_samples is presumably the number of perturbed samples per instance — confirm in `Explainer.compute_explanations`.
explanations_small_mnli = explainer_mnli.compute_explanations(
    sentences = dataset_mnli['sentence_pairs'],
    model=small_nli,
    num_samples=100,
    task='NLI',
    )

CPU times: user 8min 40s, sys: 1min 57s, total: 10min 37s
Wall time: 2min 40s


2min 40s

In [None]:
%%time

# Computing LIME explanations for the MNLI sentence pairs on the base NLI model.
# num_samples is presumably the number of perturbed samples per instance — confirm in `Explainer.compute_explanations`.
explanations_base_mnli = explainer_mnli.compute_explanations(
    sentences = dataset_mnli['sentence_pairs'],
    model=base_nli,
    num_samples=100,
    task='NLI',
    )

CPU times: user 17min 20s, sys: 3min 54s, total: 21min 15s
Wall time: 5min 20s


5min 20s


In [None]:
%%time

# Computing LIME explanations for the MNLI sentence pairs on the large NLI model.
# num_samples is presumably the number of perturbed samples per instance — confirm in `Explainer.compute_explanations`.
explanations_large_mnli = explainer_mnli.compute_explanations(
    sentences = dataset_mnli['sentence_pairs'],
    model=large_nli,
    num_samples=100,
    task='NLI',
    )

CPU times: user 54min 8s, sys: 8min 12s, total: 1h 2min 20s
Wall time: 15min 38s


15min 38s

In [None]:
# One entry per model, in the same order as `models_nli` / `model_names_nli`.
# The MNLI faithfulness loop indexes this list with each model's position, so every
# model in `models_nli` needs its explanations listed here.
explanations_mnli = [
    explanations_xsmall_mnli,
    explanations_small_mnli,
    explanations_base_mnli,
    explanations_large_mnli,
    ]

#### e-SNLI

In [None]:
# Explainer used for all e-SNLI explanations and metrics below.
# NOTE(review): the class order presumably has to match the label order of the NLI models — confirm in `Explainer`.
explainer_esnli = Explainer(class_names=['contradiction', 'entailment', 'neutral'])

In [None]:
%%time

# Computing LIME explanations for the e-SNLI sentence pairs on the xsmall NLI model.
# num_samples is presumably the number of perturbed samples per instance — confirm in `Explainer.compute_explanations`.
explanations_xsmall_esnli = explainer_esnli.compute_explanations(
    sentences = dataset_esnli['sentence_pairs'],
    model=xsmall_nli,
    num_samples=100,
    task='NLI',
    )

CPU times: user 5min 38s, sys: 490 ms, total: 5min 39s
Wall time: 1min 25s


1min 8s

In [None]:
%%time

# Computing LIME explanations for the e-SNLI sentence pairs on the small NLI model.
# num_samples is presumably the number of perturbed samples per instance — confirm in `Explainer.compute_explanations`.
explanations_small_esnli = explainer_esnli.compute_explanations(
    sentences = dataset_esnli['sentence_pairs'],
    model=small_nli,
    num_samples=100,
    task='NLI',
    )

CPU times: user 8min 20s, sys: 1.5 s, total: 8min 21s
Wall time: 2min 7s


1min 40s

In [None]:
%%time

# Computing LIME explanations for the e-SNLI sentence pairs on the base NLI model.
# num_samples is presumably the number of perturbed samples per instance — confirm in `Explainer.compute_explanations`.
explanations_base_esnli = explainer_esnli.compute_explanations(
    sentences = dataset_esnli['sentence_pairs'],
    model=base_nli,
    num_samples=100,
    task='NLI',
    )

CPU times: user 17min 2s, sys: 6.13 s, total: 17min 8s
Wall time: 4min 20s


3min 35s

In [None]:
%%time

# Computing LIME explanations for the e-SNLI sentence pairs on the large NLI model.
# num_samples is presumably the number of perturbed samples per instance — confirm in `Explainer.compute_explanations`.
explanations_large_esnli = explainer_esnli.compute_explanations(
    sentences = dataset_esnli['sentence_pairs'],
    model=large_nli,
    num_samples=100,
    task='NLI',
    )

CPU times: user 56min 14s, sys: 8min 50s, total: 1h 5min 5s
Wall time: 16min 35s


12min 45s

In [None]:
# Per-model e-SNLI explanations, ordered like `models_nli` (xsmall -> large)
explanations_esnli = [explanations_xsmall_esnli, explanations_small_esnli, explanations_base_esnli, explanations_large_esnli]

### Zero Shot Classification

In [None]:
# Zero Shot Classification
# Only the large checkpoint is active in this cell; the xsmall/small/base lines are commented out.
# `models_zsc` and `model_names_zsc` must list the same models in the same order because
# later cells pair them by position.
#xsmall_zsc = ZeroShotLearner(model_name='cross-encoder/nli-deberta-v3-xsmall')
#small_zsc = ZeroShotLearner(model_name='cross-encoder/nli-deberta-v3-small')
#base_zsc = ZeroShotLearner(model_name='cross-encoder/nli-deberta-v3-base')
large_zsc = ZeroShotLearner(model_name='cross-encoder/nli-deberta-v3-large')

models_zsc = [
    #xsmall_zsc,
    #small_zsc,
    #base_zsc,
    large_zsc,
]

model_names_zsc = [
    #'xsmall_zsc',
    #'small_zsc',
    #'base_zsc',
    'large_zsc',
]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

#### CoS-e

In [None]:
# No fixed class names because of the zero-shot classification setting:
# the candidate labels are passed per instance via `class_names_list` when computing explanations.
explainer_zsc = Explainer()

In [None]:
%%time

# Computing LIME explanations for the CoS-e questions on the xsmall zero-shot model.
# NOTE: `xsmall_zsc` is commented out in the Zero Shot Classification model cell; re-enable it there before running this cell.
explanations_xsmall_cose = explainer_zsc.compute_explanations(
   sentences = dataset_cose['question'],
   model=xsmall_zsc,
   num_samples=100,
   class_names_list=dataset_cose['candidate_labels_list']
   )

CPU times: user 2h 29min 35s, sys: 17.8 s, total: 2h 29min 53s
Wall time: 37min 27s


34min 35s

In [None]:
%%time

# Computing LIME explanations for the CoS-e questions on the small zero-shot model.
# NOTE: `small_zsc` is commented out in the Zero Shot Classification model cell; re-enable it there before running this cell.
explanations_small_cose = explainer_zsc.compute_explanations(
   sentences = dataset_cose['question'],
   model=small_zsc,
   num_samples=100,
   class_names_list=dataset_cose['candidate_labels_list']
   )

CPU times: user 3h 16min 5s, sys: 14.7 s, total: 3h 16min 19s
Wall time: 49min 2s


44min 28s

In [None]:
%%time

# Computing LIME explanations for the CoS-e questions on the base zero-shot model.
# NOTE: `base_zsc` is commented out in the Zero Shot Classification model cell; re-enable it there before running this cell.
explanations_base_cose = explainer_zsc.compute_explanations(
   sentences = dataset_cose['question'],
   model=base_zsc,
   num_samples=100,
   class_names_list=dataset_cose['candidate_labels_list']
   )

CPU times: user 6h 18min 23s, sys: 28.4 s, total: 6h 18min 51s
Wall time: 1h 34min 38s


1h 27min 7s

In [None]:
%%time

# Computing LIME explanations for the CoS-e questions on the large zero-shot model.
# Each question gets its own candidate labels via `class_names_list`.
explanations_large_cose = explainer_zsc.compute_explanations(
   sentences = dataset_cose['question'],
   model=large_zsc,
   num_samples=100,
   class_names_list=dataset_cose['candidate_labels_list']
   )

CPU times: user 18h 34min 45s, sys: 4min 1s, total: 18h 38min 46s
Wall time: 4h 39min 25s


4h 35min 50s

In [None]:
# Per-model CoS-e explanations; entries must line up by position with `models_zsc` / `model_names_zsc`.
# Only the large model is active here, matching the model cell above.
explanations_cose = [
    #explanations_xsmall_cose,
    #explanations_small_cose,
    #explanations_base_cose,
    explanations_large_cose,
    ]

## Faithfulness (Comprehensiveness and Sufficiency)

### Natural Language Inference

#### MNLI

In [None]:
# Aggregated comprehensiveness and sufficiency on NUM_EXPL MNLI explanations:
# one list of per-instance scores per model, in `models_nli` order.
comp_list_mnli, suff_list_mnli = [], []

for i, model in enumerate(models_nli):

    print('model: ', model_names_nli[i])

    # All comprehensiveness scores are computed before any sufficiency score
    comp_agg = [
        explainer_mnli.aggregated_metric(
            metric='comprehensiveness',
            explanation=explanations_mnli[i][idx],
            sentence=dataset_mnli['sentence_pairs'][idx],
            predict=model.predict,
            verbose=False,
            bins=[0.1,0.3,0.5],
            task='NLI',
        )
        for idx in range(NUM_EXPL)
    ]
    comp_list_mnli.append(comp_agg)

    suff_agg = [
        explainer_mnli.aggregated_metric(
            metric='sufficiency',
            explanation=explanations_mnli[i][idx],
            sentence=dataset_mnli['sentence_pairs'][idx],
            predict=model.predict,
            verbose=False,
            bins=[0.1,0.3,0.5],
            task='NLI',
        )
        for idx in range(NUM_EXPL)
    ]
    suff_list_mnli.append(suff_agg)


model:  xsmall_nli


NameError: ignored

In [None]:
# Mean aggregated faithfulness per NLI model on MNLI
for i, model in enumerate(models_nli):
    mean_comp = np.mean(comp_list_mnli[i])
    print(f'MNLI average aggregated comprehensiveness {model_names_nli[i]}: ', mean_comp)

for i, model in enumerate(models_nli):
    mean_suff = np.mean(suff_list_mnli[i])
    print(f'MNLI average aggregated sufficiency {model_names_nli[i]}: ', mean_suff)

- MNLI average aggregated comprehensiveness xsmall_nli:  0.78477615
- MNLI average aggregated comprehensiveness small_nli:  0.81709516
- MNLI average aggregated comprehensiveness base_nli:  0.79582304
- MNLI average aggregated comprehensiveness large_nli:  0.82296914

- MNLI average aggregated sufficiency xsmall_nli:  0.15966842
- MNLI average aggregated sufficiency small_nli:  0.13124801
- MNLI average aggregated sufficiency base_nli:  0.20492478
- MNLI average aggregated sufficiency large_nli:  0.18986347


#### e-SNLI

In [None]:
# Aggregated comprehensiveness and sufficiency on NUM_EXPL e-SNLI explanations:
# one list of per-instance scores per model, in `models_nli` order.
comp_list_esnli, suff_list_esnli = [], []

for i, model in enumerate(models_nli):

    print('model: ', model_names_nli[i])

    # All comprehensiveness scores are computed before any sufficiency score
    comp_agg = [
        explainer_esnli.aggregated_metric(
            metric='comprehensiveness',
            explanation=explanations_esnli[i][idx],
            sentence=dataset_esnli['sentence_pairs'][idx],
            predict=model.predict,
            verbose=False,
            bins=[0.1,0.3,0.5],
            task='NLI',
        )
        for idx in range(NUM_EXPL)
    ]
    comp_list_esnli.append(comp_agg)

    suff_agg = [
        explainer_esnli.aggregated_metric(
            metric='sufficiency',
            explanation=explanations_esnli[i][idx],
            sentence=dataset_esnli['sentence_pairs'][idx],
            predict=model.predict,
            verbose=False,
            bins=[0.1,0.3,0.5],
            task='NLI',
        )
        for idx in range(NUM_EXPL)
    ]
    suff_list_esnli.append(suff_agg)



model:  xsmall_nli
model:  small_nli
model:  base_nli
model:  large_nli


In [None]:
# Mean aggregated faithfulness per NLI model on e-SNLI
for i, model in enumerate(models_nli):
    mean_comp = np.mean(comp_list_esnli[i])
    print(f'e-SNLI average aggregated comprehensiveness {model_names_nli[i]}: ', mean_comp)

for i, model in enumerate(models_nli):
    mean_suff = np.mean(suff_list_esnli[i])
    print(f'e-SNLI average aggregated sufficiency {model_names_nli[i]}: ', mean_suff)

e-SNLI average aggregated comprehensiveness xsmall_nli:  0.7361461
e-SNLI average aggregated comprehensiveness small_nli:  0.7417693
e-SNLI average aggregated comprehensiveness base_nli:  0.7741472
e-SNLI average aggregated comprehensiveness large_nli:  0.750328
e-SNLI average aggregated sufficiency xsmall_nli:  0.17438322
e-SNLI average aggregated sufficiency small_nli:  0.15779552
e-SNLI average aggregated sufficiency base_nli:  0.15916185
e-SNLI average aggregated sufficiency large_nli:  0.17779283


- e-SNLI average aggregated comprehensiveness xsmall_nli:  0.7259713
- e-SNLI average aggregated comprehensiveness small_nli:  0.72445035
- e-SNLI average aggregated comprehensiveness base_nli:  0.76369643
- e-SNLI average aggregated comprehensiveness large_nli:  0.77805173
- e-SNLI average aggregated sufficiency xsmall_nli:  0.14606126
- e-SNLI average aggregated sufficiency small_nli:  0.2013229
- e-SNLI average aggregated sufficiency base_nli:  0.18698779
- e-SNLI average aggregated sufficiency large_nli:  0.19604878

### Zero Shot Classification

#### CoS-e

In [None]:
# Aggregated comprehensiveness and sufficiency on NUM_EXPL CoS-e explanations:
# one list of per-instance scores per model, in `models_zsc` order.
comp_list_cose, suff_list_cose = [], []

for i, model in enumerate(models_zsc):

    print('model: ', model_names_zsc[i])

    # All comprehensiveness scores are computed before any sufficiency score;
    # each question is scored against its own candidate labels.
    comp_agg = [
        explainer_zsc.aggregated_metric(
            metric='comprehensiveness',
            explanation=explanations_cose[i][idx],
            sentence=dataset_cose['question'][idx],
            predict=model.predict,
            verbose=False,
            bins=[0.1,0.3,0.5],
            task='ZSC',
            candidate_labels=dataset_cose['candidate_labels_list'][idx],
        )
        for idx in range(NUM_EXPL)
    ]
    comp_list_cose.append(comp_agg)

    suff_agg = [
        explainer_zsc.aggregated_metric(
            metric='sufficiency',
            explanation=explanations_cose[i][idx],
            sentence=dataset_cose['question'][idx],
            predict=model.predict,
            verbose=False,
            bins=[0.1,0.3,0.5],
            task='ZSC',
            candidate_labels=dataset_cose['candidate_labels_list'][idx],
        )
        for idx in range(NUM_EXPL)
    ]
    suff_list_cose.append(suff_agg)



In [None]:
  # Mean aggregated faithfulness per zero-shot model on CoS-e
  for i, model in enumerate(models_zsc):
      mean_comp = np.mean(comp_list_cose[i])
      print(f'CoS-e average aggregated comprehensiveness {model_names_zsc[i]}: ', mean_comp)

  for i, model in enumerate(models_zsc):
      mean_suff = np.mean(suff_list_cose[i])
      print(f'CoS-e average aggregated sufficiency {model_names_zsc[i]}: ', mean_suff)


CoS-e average aggregated comprehensiveness xsmall_zsc:  0.3244179148677116
CoS-e average aggregated comprehensiveness small_zsc:  0.3381004646808529
CoS-e average aggregated comprehensiveness base_zsc:  0.3505119280167856
CoS-e average aggregated sufficiency xsmall_zsc:  -0.09536589091022808
CoS-e average aggregated sufficiency small_zsc:  -0.12529814377427104
CoS-e average aggregated sufficiency base_zsc:  -0.07018575119475522


In [None]:
  # Mean aggregated faithfulness per zero-shot model on CoS-e
  # (same code as the previous cell, run again for the currently active models)
  for i, model in enumerate(models_zsc):
      mean_comp = np.mean(comp_list_cose[i])
      print(f'CoS-e average aggregated comprehensiveness {model_names_zsc[i]}: ', mean_comp)

  for i, model in enumerate(models_zsc):
      mean_suff = np.mean(suff_list_cose[i])
      print(f'CoS-e average aggregated sufficiency {model_names_zsc[i]}: ', mean_suff)


CoS-e average aggregated comprehensiveness large_zsc:  0.40941937537495193
CoS-e average aggregated sufficiency large_zsc:  -0.03873680220295986


- CoS-e average aggregated comprehensiveness xsmall_zsc:  0.30408690341748296
- CoS-e average aggregated comprehensiveness small_zsc:  0.31567202631694574
- CoS-e average aggregated comprehensiveness base_zsc:  0.35605754521364963
- CoS-e average aggregated comprehensiveness large_zsc:  0.3910369690948089

- CoS-e average aggregated sufficiency xsmall_zsc:  -0.10679428329070408
- CoS-e average aggregated sufficiency small_zsc:  -0.14340662457669776
- CoS-e average aggregated sufficiency base_zsc:  -0.05904798991978169
- CoS-e average aggregated sufficiency large_zsc:  -0.07993169858741261


## Plausibility (IOU and Token Level F1 Scores)

### e-SNLI

In [None]:
# Plausibility on e-SNLI: compare each model's selected explanation tokens with the human rationales.
# `top_percent=avg_len_esnli` presumably keeps the top-ranked share of tokens equal to the average
# human rationale ratio — confirm in `Explainer.get_explanation_list`.
explanation_tokens_lists = [explainer_esnli.get_explanation_list(explanations_esnli[i], top_percent=avg_len_esnli) for i in range(len(models_nli))]
ground_truth_list = dataset_esnli['extractive_explanation']

# One list of per-instance scores per model, in `models_nli` order
iou_scores_esnli, tokenf1_scores_esnli = [], []

for i, model in enumerate(models_nli):
  iou_scores_esnli.append([explainer_esnli.compute_instance_iou(explanation_tokens_lists[i][j], ground_truth_list[j]) for j in range(NUM_EXPL)])
  tokenf1_scores_esnli.append([explainer_esnli.compute_instance_f1(explanation_tokens_lists[i][j], ground_truth_list[j]) for j in range(NUM_EXPL)])

# The dataset label in the message is e-SNLI (it previously read "CoS-e", copied from the CoS-e cell)
for i, model in enumerate(models_nli):
    print(f'{model_names_nli[i]} macro_iou for {NUM_EXPL} number of explanations (e-SNLI): ', np.mean(iou_scores_esnli[i]))

for i, model in enumerate(models_nli):
    print(f'{model_names_nli[i]} macro_f1 for {NUM_EXPL} number of explanations (e-SNLI): ', np.mean(tokenf1_scores_esnli[i]))

xsmall_nli macro_iou for 100 number of explanations (CoS-e):  0.28210005517358455
small_nli macro_iou for 100 number of explanations (CoS-e):  0.2919223994959289
base_nli macro_iou for 100 number of explanations (CoS-e):  0.2618828834564128
large_nli macro_iou for 100 number of explanations (CoS-e):  0.2842248470483764
xsmall_nli macro_f1 for 100 number of explanations (CoS-e):  0.41375777000777
small_nli macro_f1 for 100 number of explanations (CoS-e):  0.4253113830613831
base_nli macro_f1 for 100 number of explanations (CoS-e):  0.3852479742479742
large_nli macro_f1 for 100 number of explanations (CoS-e):  0.4156337273837274


- xsmall_nli macro_iou for 100 number of explanations (e-SNLI):  0.27125388337153045
- small_nli macro_iou for 100 number of explanations (e-SNLI):  0.25177114388879096
- base_nli macro_iou for 100 number of explanations (e-SNLI):  0.24971816908581615
- large_nli macro_iou for 100 number of explanations (e-SNLI):  0.24985357779475428
- xsmall_nli macro_f1 for 100 number of explanations (e-SNLI):  0.40168537018537015
- small_nli macro_f1 for 100 number of explanations (e-SNLI):  0.37554323454323446
- base_nli macro_f1 for 100 number of explanations (e-SNLI):  0.37382181707181694
- large_nli macro_f1 for 100 number of explanations (e-SNLI):  0.37055167055167054

### CoS-e

In [None]:
# Plausibility on CoS-e: compare each model's selected explanation tokens with the human rationales
explanation_tokens_lists = [
    explainer_zsc.get_explanation_list(explanations_cose[i], top_percent=avg_len_cose)
    for i in range(len(models_zsc))
]
ground_truth_list = dataset_cose['extractive_explanation']

# One list of per-instance scores per model, in `models_zsc` order
iou_scores_cose, tokenf1_scores_cose = [], []

for i, model in enumerate(models_zsc):
  iou_scores_cose.append([
      explainer_zsc.compute_instance_iou(explanation_tokens_lists[i][idx], ground_truth_list[idx])
      for idx in range(NUM_EXPL)
  ])
  tokenf1_scores_cose.append([
      explainer_zsc.compute_instance_f1(explanation_tokens_lists[i][idx], ground_truth_list[idx])
      for idx in range(NUM_EXPL)
  ])

for i, model in enumerate(models_zsc):
    macro_iou = np.mean(iou_scores_cose[i])
    print(f'{model_names_zsc[i]} macro_iou for {NUM_EXPL} number of explanations (CoS-e): ', macro_iou)

for i, model in enumerate(models_zsc):
    macro_f1 = np.mean(tokenf1_scores_cose[i])
    print(f'{model_names_zsc[i]} macro_f1 for {NUM_EXPL} number of explanations (CoS-e): ', macro_f1)

xsmall_zsc macro_iou for 100 number of explanations (CoS-e):  0.22590737040737038
small_zsc macro_iou for 100 number of explanations (CoS-e):  0.20926559551559554
base_zsc macro_iou for 100 number of explanations (CoS-e):  0.23180744255744254
xsmall_zsc macro_f1 for 100 number of explanations (CoS-e):  0.3292926289396877
small_zsc macro_f1 for 100 number of explanations (CoS-e):  0.3117965139435727
base_zsc macro_f1 for 100 number of explanations (CoS-e):  0.3372584964055552


In [None]:
# Plausibility on CoS-e (same code as the previous cell, run again for the currently active models)
explanation_tokens_lists = [
    explainer_zsc.get_explanation_list(explanations_cose[i], top_percent=avg_len_cose)
    for i in range(len(models_zsc))
]
ground_truth_list = dataset_cose['extractive_explanation']

# One list of per-instance scores per model, in `models_zsc` order
iou_scores_cose, tokenf1_scores_cose = [], []

for i, model in enumerate(models_zsc):
  iou_scores_cose.append([
      explainer_zsc.compute_instance_iou(explanation_tokens_lists[i][idx], ground_truth_list[idx])
      for idx in range(NUM_EXPL)
  ])
  tokenf1_scores_cose.append([
      explainer_zsc.compute_instance_f1(explanation_tokens_lists[i][idx], ground_truth_list[idx])
      for idx in range(NUM_EXPL)
  ])

for i, model in enumerate(models_zsc):
    macro_iou = np.mean(iou_scores_cose[i])
    print(f'{model_names_zsc[i]} macro_iou for {NUM_EXPL} number of explanations (CoS-e): ', macro_iou)

for i, model in enumerate(models_zsc):
    macro_f1 = np.mean(tokenf1_scores_cose[i])
    print(f'{model_names_zsc[i]} macro_f1 for {NUM_EXPL} number of explanations (CoS-e): ', macro_f1)

large_zsc macro_iou for 100 number of explanations (CoS-e):  0.22011732711732712
large_zsc macro_f1 for 100 number of explanations (CoS-e):  0.31927603442309327


- xsmall_zsc macro_iou for 100 number of explanations (CoS-e):  0.23265864492404734
- small_zsc macro_iou for 100 number of explanations (CoS-e):  0.23059835798876044
- base_zsc macro_iou for 100 number of explanations (CoS-e):  0.23547574449114694
- large_zsc macro_iou for 100 number of explanations (CoS-e):  0.23037491476531724
- xsmall_zsc macro_f1 for 100 number of explanations (CoS-e):  0.35716010533154496
- small_zsc macro_f1 for 100 number of explanations (CoS-e):  0.35180869650645186
- base_zsc macro_f1 for 100 number of explanations (CoS-e):  0.36461320778464723
- large_zsc macro_f1 for 100 number of explanations (CoS-e):  0.3577848869826424


## Serialise objects

In [None]:
# Container for all results, keyed by dataset name ('mnli', 'esnli', 'cose') in the cells below
explainability_results = {}

In [None]:
# MNLI results per model size. Only faithfulness is stored; the notebook computes
# plausibility for e-SNLI and CoS-e only.
# `position` is the model's index in `models_nli`, which is how `comp_list_mnli` and
# `suff_list_mnli` are ordered. Macro scores are converted to plain floats, as in the e-SNLI cell.
explainability_results['mnli'] = {
        #'dataset': dataset_mnli,
        size: {
            # (feature, weight) pairs for the top predicted label of every instance
            'explanations_as_list': [explanation.as_list(label=explanation.top_labels[0]) for explanation in explanations],
            'faithfulness': {
                'comprehensiveness': comp_list_mnli[position],
                'macro_comprehensiveness': float(np.mean(comp_list_mnli[position])),
                'sufficiency': suff_list_mnli[position],
                'macro_sufficiency': float(np.mean(suff_list_mnli[position])),
            },
        }
        for position, (size, explanations) in enumerate((
            ('xsmall', explanations_xsmall_mnli),
            ('small', explanations_small_mnli),
            ('base', explanations_base_mnli),
            ('large', explanations_large_mnli),
        ))
  }

In [None]:
# e-SNLI results per model size: explanations plus faithfulness and plausibility scores.
# `position` is the model's index in `models_nli`, which is how the score lists are ordered.
explainability_results['esnli'] = {
        #'dataset': dataset_esnli,
        size: {
            'explanations_as_list': [explanation.as_list(label= explanation.top_labels[0]) for explanation in explanations],
            'faithfulness': {
                'comprehensiveness': comp_list_esnli[position],
                'macro_comprehensiveness': float(np.mean(comp_list_esnli[position])),
                'sufficiency': suff_list_esnli[position],
                'macro_sufficiency': float(np.mean(suff_list_esnli[position])),
            },
            'plausibility': {
                'iou': iou_scores_esnli[position],
                'macro_iou': float(np.mean(iou_scores_esnli[position])),
                'tokenf1': tokenf1_scores_esnli[position],
                'macro_tokenf1': float(np.mean(tokenf1_scores_esnli[position])),
            },
        }
        for position, (size, explanations) in enumerate((
            ('xsmall', explanations_xsmall_esnli),
            ('small', explanations_small_esnli),
            ('base', explanations_base_esnli),
            ('large', explanations_large_esnli),
        ))
}

In [None]:
# CoS-e results for every model that is currently active in `model_names_zsc`.
# `explanations_cose` and the score lists are ordered like `model_names_zsc`, so pairing by
# position stays correct whichever subset of models is enabled; hard-coded names and indices
# only work for one particular subset.
# The result key is the model name without its '_zsc' suffix, e.g. 'large_zsc' -> 'large'.
explainability_results['cose'] = {
        #'dataset': dataset_cose,
        model_name.rsplit('_', 1)[0]: {
            'explanations_as_list': [explanation.as_list(label=explanation.top_labels[0]) for explanation in explanations_cose[position]],
            'faithfulness': {
                'comprehensiveness': comp_list_cose[position],
                'macro_comprehensiveness': float(np.mean(comp_list_cose[position])),
                'sufficiency': suff_list_cose[position],
                'macro_sufficiency': float(np.mean(suff_list_cose[position])),
            },
            'plausibility': {
                'iou': iou_scores_cose[position],
                'macro_iou': float(np.mean(iou_scores_cose[position])),
                'tokenf1': tokenf1_scores_cose[position],
                'macro_tokenf1': float(np.mean(tokenf1_scores_cose[position])),
            },
        }
        for position, model_name in enumerate(model_names_zsc)
}



In [None]:
# Start a fresh container that holds only the CoS-e results of the large model.
# NOTE: re-assigning `explainability_results` discards every entry collected in the cells above.
explainability_results = {}

# Look up where the large model sits in the score lists instead of assuming position 0,
# so the scores stay correctly labelled when more zero-shot models are enabled.
large_position = model_names_zsc.index('large_zsc')

explainability_results['cose'] = {
        #'dataset': dataset_cose,
        'large': {
            'explanations_as_list':[explanation.as_list(label= explanation.top_labels[0]) for explanation in explanations_large_cose],
            'faithfulness': {
                'comprehensiveness': comp_list_cose[large_position],
                'macro_comprehensiveness': float(np.mean(comp_list_cose[large_position])),
                'sufficiency': suff_list_cose[large_position],
                'macro_sufficiency': float(np.mean(suff_list_cose[large_position])),
            },
            'plausibility': {
                'iou': iou_scores_cose[large_position],
                'macro_iou': float(np.mean(iou_scores_cose[large_position])),
                'tokenf1': tokenf1_scores_cose[large_position],
                'macro_tokenf1': float(np.mean(tokenf1_scores_cose[large_position])),
            },
        }
}


In [None]:
# converting NumPy types to native Python types for serializing
def check_and_convert_types(obj):
    """Recursively replace NumPy values inside `obj` with native Python equivalents.

    Dicts and lists are updated in place and returned; tuples are rebuilt because they
    are immutable. Any NumPy scalar (np.float32, np.float64, np.int64, np.bool_, ...)
    becomes the matching Python scalar and NumPy arrays become nested lists, so the
    result can be passed to `json.dump`.

    Args:
        obj: Arbitrarily nested structure of dicts, lists, tuples and leaf values.

    Returns:
        The converted structure: the same object for dicts and lists, a new object for
        tuples, arrays and NumPy scalars, and `obj` itself for every other leaf.
    """
    if isinstance(obj, dict):
        # Only values are replaced, never keys, so mutating while iterating is safe
        for key, value in obj.items():
            obj[key] = check_and_convert_types(value)
        return obj
    if isinstance(obj, list):
        for i, item in enumerate(obj):
            obj[i] = check_and_convert_types(item)
        return obj
    if isinstance(obj, tuple):
        return tuple(check_and_convert_types(item) for item in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()  # tolist() already yields native Python scalars
    if isinstance(obj, np.generic):
        return obj.item()  # NumPy scalar -> matching Python scalar
    return obj

# Convert the collected results so that they can be serialized with json.dump.
explainability_results = check_and_convert_types(explainability_results)

In [None]:
import json

# Persist the current `explainability_results` dictionary as JSON.
# NOTE(review): the file name refers to e-SNLI, but the content is whatever
# `explainability_results` holds when this cell runs — confirm the cell order.
with open('explainability_results_esnli.json', 'w') as out_file:
    json.dump(explainability_results, out_file)


In [None]:
import json

# Persist the current `explainability_results` dictionary as JSON under the
# CoS-E (large model) file name.
with open('explainability_results_cose_large_2.json', 'w') as out_file:
    json.dump(explainability_results, out_file)


In [None]:
# Colab-only helper module; this cell will not run outside Google Colab.
from google.colab import files

# Download the CoS-E results JSON from the Colab runtime.
files.download('explainability_results_cose_large_2.json')


In [None]:
def print_key_tree(d, indent=0):
    """Print the keys of a nested dictionary as an indented outline.

    Every key is printed on its own line, prefixed with two spaces per nesting
    level. Dict values are descended into; list values are descended into only
    when every element is a dict (an empty list therefore prints nothing
    further). All other values are ignored.

    Args:
        d: Dictionary whose key structure should be shown.
        indent: Nesting level used for the first line (two spaces per level).
    """
    prefix = '  ' * indent
    for name, child in d.items():
        print(prefix + str(name))

        # Collect the mappings to recurse into, if there are any.
        if isinstance(child, dict):
            nested = [child]
        elif isinstance(child, list) and all(isinstance(item, dict) for item in child):
            nested = child
        else:
            continue

        for mapping in nested:
            print_key_tree(mapping, indent + 1)

# Show the nested key structure of the results dictionary (keys only, no values).
print_key_tree(explainability_results)


esnli
  xsmall
    explanations_as_list
    faithfulness
      comprehensiveness
      macro_comprehensiveness
      sufficiency
      macro_sufficiency
    plausibility
      iou
      macro_iou
      tokenf1
      macro_tokenf1
  small
    explanations_as_list
    faithfulness
      comprehensiveness
      macro_comprehensiveness
      sufficiency
      macro_sufficiency
    plausibility
      iou
      macro_iou
      tokenf1
      macro_tokenf1
  base
    explanations_as_list
    faithfulness
      comprehensiveness
      macro_comprehensiveness
      sufficiency
      macro_sufficiency
    plausibility
      iou
      macro_iou
      tokenf1
      macro_tokenf1
  large
    explanations_as_list
    faithfulness
      comprehensiveness
      macro_comprehensiveness
      sufficiency
      macro_sufficiency
    plausibility
      iou
      macro_iou
      tokenf1
      macro_tokenf1


In [None]:
# Colab-only helper module; this cell will not run outside Google Colab.
from google.colab import files

# Download the e-SNLI results JSON from the Colab runtime.
files.download('explainability_results_esnli.json')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Logging predictions

In [None]:
%%time

# Accumulators for the predictions, one list entry per model. The two NLI
# lists are only created here; this cell fills the CoS-E list alone.
predictions_mnli, predictions_esnli, predictions_cose = [], [], []

# For every model in models_zsc (presumably the zero-shot classifiers — confirm
# where the list is built): obtain raw results for the CoS-E questions, then
# turn them into predictions using the same candidate labels.
for model in models_zsc:
  zsc_output = model.get_results(
      dataset_cose['question'],
      candidate_labels_list=dataset_cose['candidate_labels_list'],
  )
  predictions_cose.append(
      model.get_predictions(zsc_output, dataset_cose['candidate_labels_list'])
  )



CPU times: user 20min 52s, sys: 3.2 s, total: 20min 55s
Wall time: 5min 13s


In [None]:
# Reset the NLI accumulators in this cell so that re-running it starts from
# empty lists instead of appending a second copy of every model's predictions
# (the results cell reads them by position: index i <-> i-th model).
predictions_mnli, predictions_esnli = [], []

# One prediction list per model in models_nli, for MNLI and e-SNLI respectively.
for model in models_nli:
  pred_mnli = model.get_predictions(dataset_mnli['sentence_pairs'])
  pred_esnli = model.get_predictions(dataset_esnli['sentence_pairs'])
  predictions_mnli.append(pred_mnli)
  predictions_esnli.append(pred_esnli)

In [None]:
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Model sizes in the order in which the prediction lists were filled:
# predictions_*[i] is reported under MODEL_SIZES[i].
MODEL_SIZES = ('xsmall', 'small', 'base', 'large')


def _nli_metrics(true_labels, pred_labels):
    """Return predictions plus accuracy and macro-averaged precision/recall/F1."""
    return {
        'pred': pred_labels,
        'accuracy': accuracy_score(true_labels, pred_labels),
        # NOTE: the misspelled key 'percision' is kept on purpose; it is part of
        # the JSON layout that existing result files (and their readers) use.
        'percision': precision_score(true_labels, pred_labels, average='macro'),
        'recall': recall_score(true_labels, pred_labels, average='macro'),
        'f1': f1_score(true_labels, pred_labels, average='macro'),
    }


def _accuracy_only(true_labels, pred_labels):
    """Return predictions plus accuracy (the only metric reported for CoS-E)."""
    return {
        'pred': pred_labels,
        'accuracy': accuracy_score(true_labels, pred_labels),
    }


def _dataset_results(true_labels, predictions, metrics_fn):
    """Build one dataset entry: the gold labels followed by one block per model size.

    Args:
        true_labels: Gold labels of the dataset.
        predictions: Sequence of per-model prediction lists, ordered like
            MODEL_SIZES. Indexed explicitly so that a missing model raises
            IndexError instead of being silently dropped.
        metrics_fn: Callable ``(true_labels, pred_labels) -> dict`` producing
            the per-model block.
    """
    entry = {'true': true_labels}
    for index, size in enumerate(MODEL_SIZES):
        entry[size] = metrics_fn(true_labels, predictions[index])
    return entry


# Predictions and scores for every dataset / model size combination.
results = {
    'esnli': _dataset_results(dataset_esnli['test_labels'], predictions_esnli, _nli_metrics),
    'mnli': _dataset_results(dataset_mnli['test_labels'], predictions_mnli, _nli_metrics),
    'cose': _dataset_results(dataset_cose['true_labels'], predictions_cose, _accuracy_only),
}

# Predictions can contain NumPy scalars (e.g. np.int64), which json cannot
# serialize, so they are converted to native Python values first.
def convert_numpy_int(item):
    """Return a copy of ``item`` with NumPy values replaced by Python natives.

    Args:
        item: Arbitrarily nested structure of dicts, lists, tuples, NumPy
            arrays/scalars and plain Python values.

    Returns:
        A structure of the same shape in which every NumPy integer, floating
        or boolean scalar is an ``int`` / ``float`` / ``bool`` and every NumPy
        array is a (nested) list. Lists, tuples and dicts are rebuilt rather
        than modified in place; dict keys are kept as they are. Any other
        value is returned unchanged.
    """
    if isinstance(item, np.integer):  # covers np.int64 as well as np.int32 etc.
        return int(item)
    if isinstance(item, np.floating):
        return float(item)
    if isinstance(item, np.bool_):
        return bool(item)
    if isinstance(item, np.ndarray):
        # tolist() already yields native numbers; recurse for object arrays.
        return convert_numpy_int(item.tolist())
    if isinstance(item, list):
        return [convert_numpy_int(sub_item) for sub_item in item]
    if isinstance(item, tuple):
        return tuple(convert_numpy_int(sub_item) for sub_item in item)
    if isinstance(item, dict):
        return {key: convert_numpy_int(value) for key, value in item.items()}
    return item

# Build a JSON-safe copy of the results before writing them out.
results_converted = convert_numpy_int(results)


# Store the converted results as indented JSON.
with open('predictions_for_explanations.json', 'w') as out_file:
    json.dump(results_converted, out_file, indent=4)

# Colab-only helper module; this step will not run outside Google Colab.
from google.colab import files

# Download the file that was just written from the Colab runtime.
files.download('predictions_for_explanations.json')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>