# ImpPres Baseline

This notebook illustrates how to use the DeBERTa-v3-base-mnli-fever-anli model to perform specialized inference on the ImpPres dataset.

In [22]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [33]:
label_names = ["entailment", "neutral", "contradiction"]
def evaluate(premise, hypothesis):
    input = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
    output = model(input["input_ids"].to(device))
    prediction = torch.softmax(output["logits"][0], -1).tolist()
    prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
    return prediction

In [24]:
evaluate("The weather is nice today.", "It is sunny outside.")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'entailment': 0.1, 'neutral': 99.8, 'contradiction': 0.0}

In [25]:
def get_prediction(pred_dict):
    if pred_dict["entailment"] > pred_dict["contradiction"]  and pred_dict["entailment"] > pred_dict["neutral"]:
        return "entailment"
    elif pred_dict["contradiction"] > pred_dict["entailment"]:
        return "contradiction"
    else:
        return "neutral"

## Load ImpPres Dataset

In [26]:
from datasets import load_dataset

sections = ['presupposition_all_n_presupposition', 
            'presupposition_both_presupposition', 
            'presupposition_change_of_state', 
            'presupposition_cleft_existence', 
            'presupposition_cleft_uniqueness', 
            'presupposition_only_presupposition', 
            'presupposition_possessed_definites_existence', 
            'presupposition_possessed_definites_uniqueness', 
            'presupposition_question_presupposition']

dataset = {}
for section in sections:
    print(f"Loading dataset for section: {section}")
    dataset[section] = load_dataset("facebook/imppres", section)


Loading dataset for section: presupposition_all_n_presupposition
Loading dataset for section: presupposition_both_presupposition
Loading dataset for section: presupposition_change_of_state
Loading dataset for section: presupposition_cleft_existence
Loading dataset for section: presupposition_cleft_uniqueness
Loading dataset for section: presupposition_only_presupposition
Loading dataset for section: presupposition_possessed_definites_existence
Loading dataset for section: presupposition_possessed_definites_uniqueness
Loading dataset for section: presupposition_question_presupposition


In [27]:
dataset

{'presupposition_all_n_presupposition': DatasetDict({
     all_n_presupposition: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_both_presupposition': DatasetDict({
     both_presupposition: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_change_of_state': DatasetDict({
     change_of_state: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_cleft_existence': DatasetDict({
     cleft_existence: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UI

In [28]:
# Evaluate the model on the ImpPres dataset
from tqdm import tqdm
def evaluate_on_dataset(dataset):
    results = []
    label_names = ["entailment", "neutral", "contradiction"]
    for example in tqdm(dataset):
        premise = example['premise']
        hypothesis = example['hypothesis']
        prediction = evaluate(premise, hypothesis)
        results.append({
            'premise': premise,
            'hypothesis': hypothesis,
            'prediction': prediction,
            'pred_label': get_prediction(prediction),
            'gold_label': label_names[example['gold_label']],
        })
    return results

## Evaluate Metrics

Let's use the huggingface `evaluate` package to compute the performance of the baseline.


In [29]:
from evaluate import load

accuracy = load("accuracy")
precision = load("precision")
recall = load("recall")
f1 = load("f1")


In [30]:
import evaluate
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

In [31]:
clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])

{'accuracy': 0.6666666666666666,
 'f1': 0.6666666666666666,
 'precision': 1.0,
 'recall': 0.5}

## Your Turn

Compute the classification metrics on the baseline model on each section of the ImpPres dataset.

https://www.kaggle.com/code/faijanahamadkhan/llm-evaluation-framework-hugging-face provides good documentation on how to use the Huggingface evaluate library.

In [None]:
import pandas as pd

accuracies = []
precisions = []
recalls = []
f1s = []
results_table = []

for section in sections:
    print(f"Working on section: {section}")
    sec = section[15:]

    data = dataset[section]
    data = data[sec]
    
    section_results = evaluate_on_dataset(data)
    
    # Calculate metrics
    predictions = [result['pred_label'] for result in section_results]
    references = [result['gold_label'] for result in section_results]

    label_to_int = {"entailment": 0, "neutral": 1, "contradiction": 2}
    predictions_int = [label_to_int[pred] for pred in predictions]
    references_int = [label_to_int[ref] for ref in references]

    accuracy_score = accuracy.compute(predictions=predictions_int, references=references_int)['accuracy']
    f1_score = f1.compute(predictions=predictions_int, references=references_int, average='macro')['f1']
    precision_score = precision.compute(predictions=predictions_int, references=references_int, average='macro', zero_division=0)['precision']
    recall_score = recall.compute(predictions=predictions_int, references=references_int, average='macro', zero_division=0)['recall']
    
    accuracies.append(accuracy_score)
    precisions.append(precision_score)
    recalls.append(recall_score)
    f1s.append(f1_score)

    # Append results to the table
    results_table.append({
        'Section': sec,
        'Accuracy': f"{accuracy_score:.2f}",
        'Precision': f"{precision_score:.2f}",
        'Recall': f"{recall_score:.2f}",
        'F1': f"{f1_score:.2f}",
    })

# Calculate overall metrics
accuracy_all = sum(accuracies) / len(accuracies)
precision_all = sum(precisions) / len(precisions)
recall_all = sum(recalls) / len(recalls)
f1_all = sum(f1s) / len(f1s)

results_table.append({
    'Section': 'Overall',
    'Accuracy': f"{accuracy_all:.2f}",
    'Precision': f"{precision_all:.2f}",
    'Recall': f"{recall_all:.2f}",
    'F1': f"{f1_all:.2f}",
})

# Display results as a table
results_df = pd.DataFrame(results_table)
print("\n")
print(results_df.to_string(index=False))


Working on section: presupposition_all_n_presupposition


100%|██████████| 10/10 [00:02<00:00,  4.55it/s]


Working on section: presupposition_both_presupposition


100%|██████████| 10/10 [00:02<00:00,  4.73it/s]


Working on section: presupposition_change_of_state


100%|██████████| 10/10 [00:02<00:00,  4.66it/s]


Working on section: presupposition_cleft_existence


100%|██████████| 10/10 [00:02<00:00,  4.88it/s]


Working on section: presupposition_cleft_uniqueness


100%|██████████| 10/10 [00:03<00:00,  2.61it/s]


Working on section: presupposition_only_presupposition


100%|██████████| 10/10 [00:02<00:00,  4.45it/s]


Working on section: presupposition_possessed_definites_existence


100%|██████████| 10/10 [00:02<00:00,  4.05it/s]


Working on section: presupposition_possessed_definites_uniqueness


100%|██████████| 10/10 [00:02<00:00,  3.41it/s]


Working on section: presupposition_question_presupposition


100%|██████████| 10/10 [00:02<00:00,  4.14it/s]



Section                        Accuracy Precision Recall F1  
          all_n_presupposition 0.60     0.48      0.58   0.49
           both_presupposition 0.30     0.17      0.33   0.22
               change_of_state 0.00     0.00      0.00   0.00
               cleft_existence 0.60     0.50      0.58   0.51
              cleft_uniqueness 0.10     0.17      0.08   0.11
           only_presupposition 0.70     0.50      0.67   0.56
 possessed_definites_existence 0.70     0.50      0.67   0.56
possessed_definites_uniqueness 0.30     0.14      0.33   0.20
       question_presupposition 0.70     0.50      0.67   0.56
                       Overall 0.44     0.33      0.44   0.35



