# Example

In [1]:
from datasets import load_dataset

dataset = load_dataset("SetFit/mnli")

Found cached dataset json (/Users/henningheyen/.cache/huggingface/datasets/SetFit___json/SetFit--mnli-12154829fe6f4c49/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
# nli-deberta-v3-base for NLI
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/nli-deberta-v3-base')
tokenizer = AutoTokenizer.from_pretrained('cross-encoder/nli-deberta-v3-base', use_fast= False)

features = tokenizer(['A man is eating pizza', 'A black race car starts up in front of a crowd of people.'], ['A man eats something', 'A man is driving down a lonely road.'],  padding=True, truncation=True, return_tensors="pt")

model.eval()
with torch.no_grad():
    scores = model(**features).logits
    label_mapping = ['contradiction', 'entailment', 'neutral']
    labels = [label_mapping[score_max] for score_max in scores.argmax(dim=1)]
    print(labels)


['entailment', 'contradiction']


# Example on MNLI

In [3]:
i = 41

premise1 = dataset['validation'][i]['text1']
hypothesis1 = dataset['validation'][i]['text2']
label1 = dataset['validation'][i]['label']
label_text1 = dataset['validation'][i]['label_text']
dataset['validation'][i]

{'text1': "What's truly striking, though, is that Jobs has never really let this idea go.",
 'text2': 'Jobs never held onto an idea for long.',
 'label': 2,
 'idx': 41,
 'label_text': 'contradiction'}

In [4]:
j = 32

premise2 = dataset['validation'][j]['text1']
hypothesis2 = dataset['validation'][j]['text2']
label2 = dataset['validation'][j]['label']
label_text2 = dataset['validation'][j]['label_text']
dataset['validation'][j]

{'text1': 'Conversely, an increase in government saving adds to the supply of resources available for investment and may put downward pressure on interest rates.',
 'text2': 'Interest rates should increase to increase saving.',
 'label': 2,
 'idx': 32,
 'label_text': 'contradiction'}

In [5]:
# Example on MNLI
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/nli-deberta-v3-base')
tokenizer = AutoTokenizer.from_pretrained('cross-encoder/nli-deberta-v3-base', use_fast= False)

features = tokenizer([(premise1, hypothesis1), (premise2, hypothesis2)], padding=True, truncation=True, return_tensors="pt")

model.eval()
with torch.no_grad():
    scores = model(**features).logits
    print('scores: ', scores)
    label_mapping = ['contradiction', 'entailment', 'neutral']
    labels = [label_mapping[score_max] for score_max in scores.argmax(dim=1)]
    print('scores.argmax(dim=1): ', scores.argmax(dim=1))
    print('labels:', labels)

scores:  tensor([[ 4.7858, -2.6995, -1.9693],
        [ 4.6272, -3.1961, -0.7425]])
scores.argmax(dim=1):  tensor([0, 0])
labels: ['contradiction', 'contradiction']


# Evaluating on more MNLI samples

In [6]:
def make_test_set(size):
    
    # replacing labels according to last layer in the models
    new_labels = [2 if label == 1 else 1 if label == 0 else 0 for label in dataset['validation']['label']]

    test_set = [(dataset['validation'][i]['text1'], dataset['validation'][i]['text2']) for i in range(size)]
    test_labels = new_labels[:size]
    test_labels_text = dataset['validation'][:size]['label_text']

    return {'sentence_pairs': test_set, 'test_labels': test_labels, 'test_labels_text': test_labels_text}


In [13]:
test_set = make_test_set(500)

TypeError: make_test_set() got an unexpected keyword argument 'seed'

In [8]:
print(test_set['sentence_pairs'][:20])
print(test_set['test_labels'][:20])
print(test_set['test_labels_text'][:20])

[('The new rights are nice enough', 'Everyone really likes the newest benefits '), ('This site includes a list of all award winners and a searchable database of Government Executive articles.', 'The Government Executive articles housed on the website are not able to be searched.'), ("uh i don't know i i have mixed emotions about him uh sometimes i like him but at the same times i love to see somebody beat him", 'I like him for the most part, but would still enjoy seeing someone beat him.'), ("yeah i i think my favorite restaurant is always been the one closest  you know the closest as long as it's it meets the minimum criteria you know of good food", 'My favorite restaurants are always at least a hundred miles away from my house. '), ("i don't know um do you do a lot of camping", 'I know exactly.'), ("well that would be a help i wish they would do that here we have got so little landfill space left that we're going to run out before the end of this decade and it's really going to be", 

In [9]:
def get_predictions_mnli(model_name, test_set):

    model = AutoModelForSequenceClassification.from_pretrained(f'cross-encoder/{model_name}')
    tokenizer = AutoTokenizer.from_pretrained(f'cross-encoder/{model_name}', use_fast= False)

    features = tokenizer(test_set, padding=True, truncation=True, return_tensors="pt")

    model.eval()
    with torch.no_grad():
        scores = model(**features).logits
        label_mapping = ['contradiction', 'entailment', 'neutral']
        labels = [label_mapping[score_max] for score_max in scores.argmax(dim=1)]
        return scores.argmax(dim=1).tolist()

In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [87]:
%%time
deberta_xsmall_pred = get_predictions_mnli('nli-deberta-v3-xsmall', test_set['sentence_pairs'])

CPU times: user 4min 31s, sys: 1min 59s, total: 6min 30s
Wall time: 6min 31s


In [88]:
print('xsmall:', accuracy_score(test_set['test_labels'], deberta_xsmall_pred))


xsmall: 0.872


In [89]:
%%time
deberta_small_pred = get_predictions_mnli('nli-deberta-v3-small', test_set['sentence_pairs'])


CPU times: user 7min 9s, sys: 2min 29s, total: 9min 39s
Wall time: 11min 59s


In [90]:
print('small:', accuracy_score(test_set['test_labels'], deberta_small_pred))

small: 0.879


In [91]:
%%time
deberta_base_pred = get_predictions_mnli('nli-deberta-v3-base', test_set['sentence_pairs'])

CPU times: user 14min 40s, sys: 5min 12s, total: 19min 52s
Wall time: 1d 21h 28min 47s


In [92]:
print('base:', accuracy_score(test_set['test_labels'], deberta_base_pred))

base: 0.896


In [11]:
%%time
deberta_large_pred = get_predictions_mnli('nli-deberta-v3-large', test_set['sentence_pairs'])

CPU times: user 23min 19s, sys: 5min 7s, total: 28min 27s
Wall time: 1h 16min 17s


In [12]:
print('large:', accuracy_score(test_set['test_labels'], deberta_large_pred))

large: 0.898


In [14]:

print('xsmall:', accuracy_score(test_set['test_labels'], deberta_xsmall_pred))
#print('small:', accuracy_score(test_set['test_labels'], deberta_small_pred))
#print('base:', accuracy_score(test_set['test_labels'], deberta_base_pred))
#print('large:', accuracy_score(test_set['test_labels'], deberta_large_pred))

xsmall: 0.954


On 1000 validation examples. Accuracy scores:
- 0.872 xsmall
- 0.879 small
- 0.896 base
- 0.898 large (500)

In [None]:
# xsmall: 0.954 for 1000 train samples

In [None]:
model_names = ['deberta_xsmall', 'deberta_small', 'deberta_base', 'deberta_large']
predictions = [deberta_xsmall_pred, deberta_small_pred, deberta_base_pred, deberta_large_pred]
scores = [evaluate(test_labels, predicted_labels) for predicted_labels in predictions]

In [None]:
import pandas as pd

df = pd.DataFrame(scores)
df['model'] = model_names


# Explainability

In [5]:
from model import ZeroShotNLI
from utils import make_test_set
from lime.lime_text import LimeTextExplainer

In [6]:
test_set = make_test_set(10)

Found cached dataset json (/Users/henningheyen/.cache/huggingface/datasets/SetFit___json/SetFit--mnli-12154829fe6f4c49/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
xsmall = ZeroShotNLI('nli-deberta-v3-small')

probs_xsmall = xsmall.predict_for_lime(sentence_pairs = test_set['sentence_pairs'])


In [14]:
class_names = ['contradiction', 'entailment', 'neutral']
explainer = LimeTextExplainer(class_names=class_names)

def show_lime(sentence_pairs, predict, num_samples=100, num_features=10):
  
  for sentence_pair in sentence_pairs:
    text_instance = sentence_pair[0] + " [SEP] " + sentence_pair[0]
    explanation = compute_explanation(text_instance, predict, num_samples=num_samples, num_features=num_features)
    print("sentence_pair: ", text_instance)
    print("LIME Explanation:")
    explanation.show_in_notebook(text=True)
    print('-----------------------')

def compute_explanation(sentence_pair, predict, num_samples=100, num_features=10):
  return explainer.explain_instance(sentence_pair, predict, num_samples=num_samples, num_features=num_features)

In [15]:
show_lime(test_set['sentence_pairs'][:2], xsmall.predict_for_lime, num_samples=10)

sentence_pair:  Impossible. [SEP] Impossible.
LIME Explanation:


AttributeError: 'Tensor' object has no attribute 'astype'