# ImpPres Baseline

This notebook illustrates how to use the DeBERTa-v3-base-mnli-fever-anli model to perform specialized inference on the ImpPres dataset.

In [32]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [33]:
label_names = ["entailment", "neutral", "contradiction"]
def evaluate(premise, hypothesis):
    input = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
    output = model(input["input_ids"].to(device))
    prediction = torch.softmax(output["logits"][0], -1).tolist()
    prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
    return prediction

In [34]:
evaluate("The weather is nice today.", "It is sunny outside.")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'entailment': 0.1, 'neutral': 99.8, 'contradiction': 0.0}

In [35]:
def get_prediction(pred_dict):
    if pred_dict["entailment"] > pred_dict["contradiction"]  and pred_dict["entailment"] > pred_dict["neutral"]:
        return "entailment"
    elif pred_dict["contradiction"] > pred_dict["entailment"]:
        return "contradiction"
    else:
        return "neutral"

## Load ImpPres Dataset

In [36]:
from datasets import load_dataset

sections = ['presupposition_all_n_presupposition', 
            'presupposition_both_presupposition', 
            'presupposition_change_of_state', 
            'presupposition_cleft_existence', 
            'presupposition_cleft_uniqueness', 
            'presupposition_only_presupposition', 
            'presupposition_possessed_definites_existence', 
            'presupposition_possessed_definites_uniqueness', 
            'presupposition_question_presupposition']

dataset = {}
for section in sections:
    print(f"Loading dataset for section: {section}")
    dataset[section] = load_dataset("facebook/imppres", section)


Loading dataset for section: presupposition_all_n_presupposition
Loading dataset for section: presupposition_both_presupposition
Loading dataset for section: presupposition_change_of_state
Loading dataset for section: presupposition_cleft_existence
Loading dataset for section: presupposition_cleft_uniqueness
Loading dataset for section: presupposition_only_presupposition
Loading dataset for section: presupposition_possessed_definites_existence
Loading dataset for section: presupposition_possessed_definites_uniqueness
Loading dataset for section: presupposition_question_presupposition


In [37]:
dataset

{'presupposition_all_n_presupposition': DatasetDict({
     all_n_presupposition: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_both_presupposition': DatasetDict({
     both_presupposition: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_change_of_state': DatasetDict({
     change_of_state: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_cleft_existence': DatasetDict({
     cleft_existence: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UI

In [38]:
# Evaluate the model on the ImpPres dataset
from tqdm import tqdm
def evaluate_on_dataset(dataset):
    results = []
    label_names = ["entailment", "neutral", "contradiction"]
    for example in tqdm(dataset):
        premise = example['premise']
        hypothesis = example['hypothesis']
        prediction = evaluate(premise, hypothesis)
        results.append({
            'premise': premise,
            'hypothesis': hypothesis,
            'prediction': prediction,
            'pred_label': get_prediction(prediction),
            'gold_label': label_names[example['gold_label']],
        })
    return results

## Evaluate Metrics

Let's use the huggingface `evaluate` package to compute the performance of the baseline.


In [39]:
import evaluate as eval_lib

accuracy = eval_lib.load("accuracy")
precision = eval_lib.load("precision")
recall = eval_lib.load("recall")
f1 = eval_lib.load("f1")


In [40]:
clf_metrics = eval_lib.combine(["accuracy", "f1", "precision", "recall"])

In [41]:
clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])

{'accuracy': 0.6666666666666666,
 'f1': 0.6666666666666666,
 'precision': 1.0,
 'recall': 0.5}

## Your Turn

Compute the classification metrics on the baseline model on each section of the ImpPres dataset.

https://www.kaggle.com/code/faijanahamadkhan/llm-evaluation-framework-hugging-face provides good documentation on how to use the Huggingface evaluate library.

In [43]:
print("=" * 80)
print("ðŸ“Š TASK 2.2: EVALUATING DEBERTA BASELINE ON IMPPRES PRESUPPOSITION DATA")
print("=" * 80)
print("ðŸ“‹ Model: DeBERTa-v3-base-mnli-fever-anli")
print("ðŸ“‹ Sections: 9 presupposition types")
print("ðŸ“‹ Metrics: Accuracy, Precision, Recall, F1 (using evaluate package)")
print("=" * 80)

def compute_metrics_on_section(dataset_section, section_name):
    """
    Compute classification metrics for a specific presupposition section
    """
    # Get the actual split name for this section
    split_name = list(dataset_section.keys())[0]
    section_data = dataset_section[split_name]
    
    print(f"\nðŸ”„ Evaluating on {section_name}...")
    print(f"Number of samples: {len(section_data)}")
    
    # Run evaluation on the section
    predictions = evaluate_on_dataset(section_data)
    
    # Extract predictions and gold labels
    pred_labels = [p['pred_label'] for p in predictions]
    gold_labels = [p['gold_label'] for p in predictions]
    
    # Map labels to integers for metrics computation
    label_to_int = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
    pred_ints = [label_to_int[label] for label in pred_labels]
    gold_ints = [label_to_int[label] for label in gold_labels]
    
    # Compute metrics using evaluate package with macro averaging for multiclass
    accuracy_result = accuracy.compute(predictions=pred_ints, references=gold_ints)
    f1_result = f1.compute(predictions=pred_ints, references=gold_ints, average='macro')
    precision_result = precision.compute(predictions=pred_ints, references=gold_ints, average='macro')
    recall_result = recall.compute(predictions=pred_ints, references=gold_ints, average='macro')
    
    metrics = {
        'accuracy': accuracy_result['accuracy'],
        'f1': f1_result['f1'],
        'precision': precision_result['precision'],
        'recall': recall_result['recall']
    }
    
    # Add section information
    metrics['section'] = section_name
    metrics['num_samples'] = len(predictions)
    
    print(f"ðŸ“Š Results for {section_name}:")
    print(f"- Samples: {metrics['num_samples']}")
    print(f"- Accuracy: {metrics['accuracy']:.3f}")
    print(f"- F1 (macro): {metrics['f1']:.3f}")
    print(f"- Precision (macro): {metrics['precision']:.3f}")
    print(f"- Recall (macro): {metrics['recall']:.3f}")
    
    return metrics, predictions

# Evaluate each presupposition section
all_results = {}
all_predictions = []

for section in sections:
    metrics, predictions = compute_metrics_on_section(dataset[section], section)
    all_results[section] = metrics
    all_predictions.extend(predictions)

print("\n" + "=" * 80)
print("ðŸ“Š COMPUTING OVERALL METRICS ACROSS ALL SECTIONS")
print("=" * 80)


ðŸ“Š TASK 2.2: EVALUATING DEBERTA BASELINE ON IMPPRES PRESUPPOSITION DATA
ðŸ“‹ Model: DeBERTa-v3-base-mnli-fever-anli
ðŸ“‹ Sections: 9 presupposition types
ðŸ“‹ Metrics: Accuracy, Precision, Recall, F1 (using evaluate package)

ðŸ”„ Evaluating on presupposition_all_n_presupposition...
Number of samples: 1900


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1900/1900 [02:40<00:00, 11.84it/s]


ðŸ“Š Results for presupposition_all_n_presupposition:
- Samples: 1900
- Accuracy: 0.463
- F1 (macro): 0.420
- Precision (macro): 0.433
- Recall (macro): 0.475

ðŸ”„ Evaluating on presupposition_both_presupposition...
Number of samples: 1900


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1900/1900 [02:43<00:00, 11.62it/s]


ðŸ“Š Results for presupposition_both_presupposition:
- Samples: 1900
- Accuracy: 0.397
- F1 (macro): 0.316
- Precision (macro): 0.275
- Recall (macro): 0.394

ðŸ”„ Evaluating on presupposition_change_of_state...
Number of samples: 1900


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1900/1900 [02:45<00:00, 11.51it/s]


ðŸ“Š Results for presupposition_change_of_state:
- Samples: 1900
- Accuracy: 0.308
- F1 (macro): 0.316
- Precision (macro): 0.334
- Recall (macro): 0.324

ðŸ”„ Evaluating on presupposition_cleft_existence...
Number of samples: 1900


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1900/1900 [02:43<00:00, 11.65it/s]


ðŸ“Š Results for presupposition_cleft_existence:
- Samples: 1900
- Accuracy: 0.641
- F1 (macro): 0.630
- Precision (macro): 0.678
- Recall (macro): 0.694

ðŸ”„ Evaluating on presupposition_cleft_uniqueness...
Number of samples: 1900


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1900/1900 [02:39<00:00, 11.91it/s]


ðŸ“Š Results for presupposition_cleft_uniqueness:
- Samples: 1900
- Accuracy: 0.195
- F1 (macro): 0.191
- Precision (macro): 0.214
- Recall (macro): 0.186

ðŸ”„ Evaluating on presupposition_only_presupposition...
Number of samples: 1900


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1900/1900 [07:55<00:00,  4.00it/s]  


ðŸ“Š Results for presupposition_only_presupposition:
- Samples: 1900
- Accuracy: 0.583
- F1 (macro): 0.565
- Precision (macro): 0.646
- Recall (macro): 0.640

ðŸ”„ Evaluating on presupposition_possessed_definites_existence...
Number of samples: 1900


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1900/1900 [10:03<00:00,  3.15it/s]  


ðŸ“Š Results for presupposition_possessed_definites_existence:
- Samples: 1900
- Accuracy: 0.671
- F1 (macro): 0.664
- Precision (macro): 0.799
- Recall (macro): 0.737

ðŸ”„ Evaluating on presupposition_possessed_definites_uniqueness...
Number of samples: 1900


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1900/1900 [17:53<00:00,  1.77it/s]   


ðŸ“Š Results for presupposition_possessed_definites_uniqueness:
- Samples: 1900
- Accuracy: 0.390
- F1 (macro): 0.293
- Precision (macro): 0.252
- Recall (macro): 0.383

ðŸ”„ Evaluating on presupposition_question_presupposition...
Number of samples: 1900


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1900/1900 [18:22<00:00,  1.72it/s]    


ðŸ“Š Results for presupposition_question_presupposition:
- Samples: 1900
- Accuracy: 0.625
- F1 (macro): 0.599
- Precision (macro): 0.699
- Recall (macro): 0.695

ðŸ“Š COMPUTING OVERALL METRICS ACROSS ALL SECTIONS


In [None]:
# Compute overall metrics across all sections
print(f"Computing overall metrics on {len(all_predictions)} total samples...")

# Extract overall predictions and gold labels
overall_pred_labels = [p['pred_label'] for p in all_predictions]
overall_gold_labels = [p['gold_label'] for p in all_predictions]

# Map labels to integers
label_to_int = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
overall_pred_ints = [label_to_int[label] for label in overall_pred_labels]
overall_gold_ints = [label_to_int[label] for label in overall_gold_labels]

# Compute overall metrics with macro averaging for multiclass
overall_accuracy_result = accuracy.compute(predictions=overall_pred_ints, references=overall_gold_ints)
overall_f1_result = f1.compute(predictions=overall_pred_ints, references=overall_gold_ints, average='macro')
overall_precision_result = precision.compute(predictions=overall_pred_ints, references=overall_gold_ints, average='macro')
overall_recall_result = recall.compute(predictions=overall_pred_ints, references=overall_gold_ints, average='macro')

overall_metrics = {
    'accuracy': overall_accuracy_result['accuracy'],
    'f1': overall_f1_result['f1'],
    'precision': overall_precision_result['precision'],
    'recall': overall_recall_result['recall']
}
overall_metrics['section'] = 'ALL_SECTIONS'
overall_metrics['num_samples'] = len(all_predictions)

print(f"ðŸ“Š Overall Results across all presupposition sections:")
print(f"- Total Samples: {overall_metrics['num_samples']}")
print(f"- Accuracy: {overall_metrics['accuracy']:.3f}")
print(f"- F1 (macro): {overall_metrics['f1']:.3f}")
print(f"- Precision (macro): {overall_metrics['precision']:.3f}")
print(f"- Recall (macro): {overall_metrics['recall']:.3f}")

# Add overall results to the results dictionary
all_results['ALL_SECTIONS'] = overall_metrics

print("\n" + "=" * 80)
print("ðŸ“‹ SUMMARY TABLE: DEBERTA BASELINE PERFORMANCE ON IMPPRES PRESUPPOSITIONS")
print("=" * 80)

# Create results table
print(f"{'Section':<45} {'Samples':<8} {'Accuracy':<9} {'F1':<6} {'Precision':<10} {'Recall':<6}")
print("-" * 90)

# Print individual section results
for section in sections:
    r = all_results[section]
    print(f"{section:<45} {r['num_samples']:<8} {r['accuracy']:<9.3f} {r['f1']:<6.3f} {r['precision']:<10.3f} {r['recall']:<6.3f}")

# Print overall results
print("-" * 90)
r = all_results['ALL_SECTIONS']
print(f"{'ALL_SECTIONS':<45} {r['num_samples']:<8} {r['accuracy']:<9.3f} {r['f1']:<6.3f} {r['precision']:<10.3f} {r['recall']:<6.3f}")

print("\n" + "=" * 80)
print("âœ… TASK 2.2 COMPLETED")
print("ðŸ“Š DeBERTa baseline evaluation completed on all 9 presupposition sections")
print("ðŸ“‹ Results show performance metrics for each section and overall performance")
print("=" * 80)
