In this workbook I want to calculate the automatic metrics from the input

In [5]:
import torch
from tqdm import tqdm
import os
import yaml
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from src.utils import (
    linearise_input, convert_to_features, form_stepwise_input, 
    simplify_feat_names,
    label_qs,
    simplify_narr_question,
    nums_to_names
)
from evaluate import load
from torchmetrics.text.infolm import InfoLM


In [3]:
dataset = load_dataset("james-burton/textual-explanations-702010")

# Form the linearised or stepwise (and linearised) input
dataset = dataset.map(
    lambda x: linearise_input(x, 'baseline_input', 20),
    load_from_cache_file=False
    ) 



Using custom data configuration james-burton--textual-explanations-702010-cac443d3271dff16
Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--textual-explanations-702010-cac443d3271dff16/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 3/3 [00:00<00:00, 673.13it/s]
100%|██████████| 94/94 [00:00<00:00, 2725.87ex/s]
100%|██████████| 328/328 [00:00<00:00, 2790.71ex/s]
100%|██████████| 47/47 [00:00<00:00, 2837.74ex/s]


In [4]:
# Evaluate the predictions
bleurt = load('bleurt',checkpoint="bleurt-base-512")
bleu = load('bleu')
meteor = load('meteor')

results = {}
# preds = dataset['input']
# refs = dataset['narration']

narrs_w_names = [nums_to_names(narr, eval(c2s), eval(f2s))
                        for narr, c2s, f2s
                        in zip(dataset['test']['narration'],
                            dataset['test']['class2name'],
                            dataset['test']['ft_num2name'])]
input_w_names = [nums_to_names(inp, eval(c2s), eval(f2s))
                        for inp, c2s, f2s
                        in zip(dataset['test']['input'],
                            dataset['test']['class2name'],
                            dataset['test']['ft_num2name'])]

preds = narrs_w_names
refs = input_w_names
        
bleurt_results = bleurt.compute(predictions=preds,
                                references=refs)
bleu_results = bleu.compute(predictions=preds,
                            references=[[r] for r in refs])
meteor_results = meteor.compute(predictions=preds,
                                references=[[r] for r in refs])


Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: evaluate.load('bleurt', 'bleurt-large-512').


INFO:tensorflow:Reading checkpoint /home/james/.cache/huggingface/metrics/bleurt/default/downloads/extracted/d6b01862d09a8feced08a9ee5a0c887edbafd7f450ddd83f5907da1bfbbc8754/bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.


2023-01-29 11:47:04.130256: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-29 11:47:04.130461: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcufft.so.10'; dlerror: libcufft.so.10: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda/lib64:/home/james/Downloads/TensorRT-8.5.1.7/lib
2023-01-29 11:47:04.143353: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusparse.so.11'; dlerror: libcusparse.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda/lib64:/home/james/Downloads/TensorRT-8.5.1.7/lib
2023-01-29 11:47:04.143393: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. 

INFO:tensorflow:BLEURT initialized.


[nltk_data] Downloading package wordnet to /home/james/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/james/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/james/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
infolm = InfoLM('google/bert_uncased_L-2_H-128_A-2', idf=False)
infolm(preds, refs)

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 2/2 [00:01<00:00,  1.92it/s]
100%|██████████| 2/2 [00:01<00:00,  1.82it/s]


tensor(-5.7852)

## InfoLM but using the predictions from the model

In [7]:
runs = '''radiant-snake-164
        brilliant-paper-165
        beaming-kumquat-166
        dancing-goat-167
        vivid-peony-175
        prosperous-laughter-169
        incandescent-goat-171
        red-peony-172
        alight-rat-173
        red-laughter-174'''.split()

In [12]:
run_dict = {}
for i, run in enumerate(runs):
    model = 't5-base' if i % 2 == 0 else 'bart-base'
    with open(f'../models/{model}/{run}/test_predictions.txt') as f:
        preds = f.readlines()
    run_dict[run] = infolm(preds, refs)
print(run_dict)

100%|██████████| 2/2 [00:01<00:00,  1.83it/s]
100%|██████████| 2/2 [00:01<00:00,  1.88it/s]
100%|██████████| 2/2 [00:01<00:00,  1.87it/s]
100%|██████████| 2/2 [00:01<00:00,  1.83it/s]
100%|██████████| 2/2 [00:01<00:00,  1.81it/s]
100%|██████████| 2/2 [00:01<00:00,  1.80it/s]
100%|██████████| 2/2 [00:01<00:00,  1.78it/s]
100%|██████████| 2/2 [00:01<00:00,  1.85it/s]
100%|██████████| 2/2 [00:01<00:00,  1.81it/s]
100%|██████████| 2/2 [00:01<00:00,  1.84it/s]
100%|██████████| 2/2 [00:01<00:00,  1.85it/s]
100%|██████████| 2/2 [00:01<00:00,  1.80it/s]
100%|██████████| 2/2 [00:01<00:00,  1.86it/s]
100%|██████████| 2/2 [00:01<00:00,  1.78it/s]
100%|██████████| 2/2 [00:01<00:00,  1.79it/s]
100%|██████████| 2/2 [00:01<00:00,  1.84it/s]
100%|██████████| 2/2 [00:01<00:00,  1.56it/s]
100%|██████████| 2/2 [00:01<00:00,  1.85it/s]
100%|██████████| 2/2 [00:01<00:00,  1.77it/s]
100%|██████████| 2/2 [00:01<00:00,  1.84it/s]

{'radiant-snake-164': tensor(-5.5911), 'brilliant-paper-165': tensor(-4.5172), 'beaming-kumquat-166': tensor(-5.2534), 'dancing-goat-167': tensor(-4.8911), 'vivid-peony-175': tensor(-5.6877), 'prosperous-laughter-169': tensor(-4.3350), 'incandescent-goat-171': tensor(-4.7722), 'red-peony-172': tensor(-4.9977), 'alight-rat-173': tensor(-5.0640), 'red-laughter-174': tensor(-4.6484)}





In [14]:
refs

['Predicted class is "Return", value of 100.00%. Other classes and values are "Go Away" 0.00%. Top features are Perference(P2), Delay of delivery person picking up food, and Ease and convenient. Postive features are Ease and convenient. Negative features are Perference(P2) and Delay of delivery person picking up food. Lowest impact features are Educational Qualifications, Gender, and Good Quantity.',
 'Predicted class is "More", value of 91.95%. Other classes and values are "Less" 8.05%. Top features are X24, X8, and X1. Postive features are X24 and X1. Negative features are X8. Lowest impact features are X13, X12, and X26.',
 'Predicted class is "Stay", value of 89.16%. Other classes and values are "Leave" 10.84%. Top features are IsActiveMember, Age, and Geography. Postive features are Age and Geography. Negative features are IsActiveMember. Lowest impact features are Balance, EstimatedSalary, and HasCrCard.',
 'Predicted class is "Luxury", value of 97.02%. Other classes and values a

In [4]:
import numpy as np

In [5]:
print({'bleurt': np.mean(bleurt_results['scores']),
         'bleu': bleu_results['bleu'],
            'meteor': meteor_results['meteor']})



{'bleurt': -0.5074241114185846, 'bleu': 0.10726705757444158, 'meteor': 0.41644034699755955}


In [6]:
dataset['input']

['Predicted class is C1, value of 100.00%. Other classes and values are C2 0.00%. Top features are F40, F8, F5, and F22. Postive features are F5. Negative features are F40, F8, and F22. Lowest impact features are F31, F24, F34, F25, and F41.',
 'Predicted class is C1, value of 91.95%. Other classes and values are C2 8.05%. Top features are F6, F5, F11, and F26. Postive features are F6 and F11. Negative features are F5 and F26. Lowest impact features are F8, F9, F22, F17, and F12.',
 'Predicted class is C1, value of 89.16%. Other classes and values are C2 10.84%. Top features are F1, F5, F2, and F3. Postive features are F5 and F2. Negative features are F1 and F3. Lowest impact features are F6, F10, F4, F7, and F9.',
 'Predicted class is C1, value of 97.02%. Other classes and values are C2 2.98%. Top features are F10, F11, F13, and F6. Postive features are F10, F11, F13, and F6. Negative features are  . Lowest impact features are F3, F17, F4, F12, and F14.',
 'Predicted class is C2, valu

In [7]:
dataset.filter(lambda x: x['unique_id'] == 429)

100%|██████████| 1/1 [00:00<00:00, 169.11ba/s]


Dataset({
    features: ['model_name', 'predicted_class', 'task_name', 'narration', 'values', 'sign', 'narrative_id', 'unique_id', 'classes_dict', 'narrative_questions', 'feature_nums', 'ft_num2name', 'old2new_ft_nums', 'old2new_classes', 'predicted_class_label', 'class2name', 'input'],
    num_rows: 1
})

In [8]:
from src.utils import nums_to_names

x = dataset.filter(lambda x: x['unique_id'] == 429)[0]
nums_to_names(x['narration'], eval(x['class2name']), eval(x['ft_num2name']))

Loading cached processed dataset at /home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--textual-explanations-702010-cac443d3271dff16/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-a4ca6c3359638a88.arrow


'The case is labelled as "high quality" by the classifier, with the likelihood of this being correct equal to 94.37%, suggesting that there is a slight chance of about 5.63% that this decision could be wrong. The above prediction by the classifier is mainly based on the values of the features volatile acidity, sulphates, total sulfur dioxide, and alcohol, which, according to the analysis performed, offer very strong positive support for the prediction. The other variables with a positive influence on the decision are citric acid, fixed acidity, and density, further cementing the belief in the decision made here. The 5.63% likelihood of the "low_quality" can be blamed on the negative influence of chlorides, residual sugar, free sulfur dioxide, and pH, decreasing the likelihood of the "high quality" label assigned to the case under consideration. In summary, the confidence level of 94.37% in the "high quality" assignment is mainly due to the strong positive influence of sulphates, volati

In [9]:
aug = load_dataset("james-burton/aug-text-exps")

Using custom data configuration james-burton--aug-text-exps-a7fb5fbf61784010
Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--aug-text-exps-a7fb5fbf61784010/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 3/3 [00:00<00:00, 791.58it/s]


In [10]:
x['class2name']

"{'C1': 'low_quality', 'C2': 'high quality'}"

In [11]:
x['input']

'Predicted class is C2, value of 94.37%. Other classes and values are C1 5.63%. Top features are F4, F6, F7, and F11. Postive features are F4, F6, F7, and F11. Negative features are  . Lowest impact features are F8, F9, F1, F3, and F5.'

  from .autonotebook import tqdm as notebook_tqdm
2023-01-29 11:45:48.466388: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-29 11:45:49.030709: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda/lib64:/home/james/Downloads/TensorRT-8.5.1.7/lib
2023-01-29 11:45:49.030758: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cu

tensor(-0.1784)

In [None]:
%    - SVM 96
%    - LogReg 90
%    - RF 82
%    - KNN 59
%    - Gradient Boosting 38
%    - Decision Tree 25
%    - MLP 15
%    _ DNN 13
%    - Adaboost 5
%    - GaussianNB 4

In [29]:
np.mean([96, 90, 82, 59, 38, 25, 15, 13, 5, 4])

42.7

In [30]:
np.std([96, 90, 82, 59, 38, 25, 15, 13, 5, 4])

34.33962725482034

In [31]:
dataset

Dataset({
    features: ['model_name', 'predicted_class', 'task_name', 'narration', 'values', 'sign', 'narrative_id', 'unique_id', 'classes_dict', 'narrative_questions', 'feature_nums', 'ft_num2name', 'old2new_ft_nums', 'old2new_classes', 'predicted_class_label', 'class2name', 'input'],
    num_rows: 375
})

In [32]:
dataset = load_dataset("james-burton/textual-explanations-702010")

Using custom data configuration james-burton--textual-explanations-702010-cac443d3271dff16
Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--textual-explanations-702010-cac443d3271dff16/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 3/3 [00:00<00:00, 267.16it/s]


In [33]:
dataset['train']['narration']

["The prediction probability associated with class C2 and class C1, respectively, is 35.34% and 64.66%. Based on these probabilities, the model labels the given case as C1 since it is the most probable class. According to the attribution analysis, the most relevant features considered by the model here are F5, F1, and F8, while the least relevant features are F12, F2, and F4. Regarding the direction of influence of the features, F5, F1, F8, and F7 are the top positively supporting features, driving the decision higher in favour of C1. Further increasing the probability that C1 is the true label are the values of other positive features such as F16, F3, F15, and F14. To explain why the likelihood of C2 is 35.34%, we have to look at the negative contributions from F11, F6, F13, F2, F12, and F4. The abovementioned negative features contradict the model's decision with respect to the classification outcome.",
 "The classifier is very uncertain about the correct label for the case given.  R

In [37]:
narrs_w_names = [nums_to_names(narr, eval(c2s), eval(f2s))
                        for narr, c2s, f2s
                        in zip(dataset['test']['narration'],
                            dataset['test']['class2name'],
                            dataset['test']['ft_num2name'])]
narrs_w_names.extend([nums_to_names(narr, eval(c2s), eval(f2s))
                        for narr, c2s, f2s
                        in zip(dataset['train']['narration'],
                            dataset['train']['class2name'],
                            dataset['train']['ft_num2name'])])
narrs_w_names.extend([nums_to_names(narr, eval(c2s), eval(f2s))
                        for narr, c2s, f2s
                        in zip(dataset['validation']['narration'],
                            dataset['validation']['class2name'],
                            dataset['validation']['ft_num2name'])])
                 

In [38]:
len(narrs_w_names)

469

In [42]:
from nltk import sent_tokenize, word_tokenize
lens = [len(word_tokenize(narr)) for narr in narrs_w_names]

In [43]:
np.mean(lens)

187.61194029850745

In [44]:
np.std(lens)

47.216585146179

In [49]:
len({ word for narr in narrs_w_names for word in word_tokenize(narr) })

2466

In [54]:
tasks = set(dataset['train']['task_name'])
tasks.update(dataset['test']['task_name'])
tasks.update(dataset['validation']['task_name'])
len(tasks)

40

In [58]:
from collections import Counter
cnt = Counter(dataset['train']['task_name'])
cnt.update(dataset['test']['task_name'])
cnt.update(dataset['validation']['task_name'])
print(np.mean([t[1] for t in cnt.most_common()]))
print(np.std([t[1] for t in cnt.most_common()]))

11.725
3.3910728390879488
