# ImpPres LLM Baseline

In this notebook you will implement a baseline for ImpPres classification using an LLM.
The baseline must be implemented with DSPy.



In [2]:
# Configure the DSPy environment with the language model.
# For Grok, the API key is read from os.environ['XAI_API_KEY'] and the
# model id is "xai/grok-3-mini".
import os
import dspy

from dotenv import load_dotenv

# Read the key file so that XAI_API_KEY becomes available through os.environ.
load_dotenv("grok_key.ini")

# Fail fast with an actionable message: a missing or empty key would otherwise
# surface as a bare KeyError('XAI_API_KEY') with no hint about where the key
# is expected to come from.
xai_api_key = os.environ.get('XAI_API_KEY')
if not xai_api_key:
    raise KeyError(
        "XAI_API_KEY is not set: define it in grok_key.ini or export it "
        "in the environment before running this cell"
    )

lm = dspy.LM('xai/grok-3-mini', api_key=xai_api_key)
# for ollama
# lm = dspy.LM('ollama_chat/devstral', api_base='http://localhost:11434', api_key='')
dspy.configure(lm=lm)

In [3]:
from typing import Literal

## Implement the DSPy program to classify pairs (premise, hypothesis) as entailment, contradiction, or neutral.

# DSPy signature for the basic classifier: two string inputs (premise and
# hypothesis) and a single output restricted to the three label strings
# 'entailment', 'neutral' and 'contradiction'.
# NOTE(review): documented with comments instead of a docstring because DSPy is
# understood to use a Signature's docstring as the prompt instructions, so adding
# one would change the model's behavior — confirm before converting.
class ImpPresClassifier(dspy.Signature):
    premise: str = dspy.InputField()
    hypothesis: str = dspy.InputField()
    label: Literal['entailment', 'neutral', 'contradiction'] = dspy.OutputField()

# Plain `dspy.Predict` module built over the basic signature.
classifier = dspy.Predict(ImpPresClassifier)


def classify(premise, hypothesis):
    """Classify one (premise, hypothesis) pair with the basic predictor.

    Args:
        premise: Premise sentence.
        hypothesis: Hypothesis sentence to judge against the premise.

    Returns:
        The `label` attribute of the predictor's result.
    """
    prediction = classifier(premise=premise, hypothesis=hypothesis)
    return prediction.label

In [None]:
# DSPy signature for the reasoning variant: same inputs and label output as the
# basic signature, plus a free-text `explanation` output declared before `label`.
# NOTE(review): presumably declaring `explanation` first makes the model write
# its reasoning before committing to a label — confirm against DSPy's prompt format.
# Documented with comments rather than a docstring because DSPy is understood to
# use a Signature's docstring as prompt instructions — confirm before adding one.
class ImpPresCoTClassifier(dspy.Signature):
    premise: str = dspy.InputField()
    hypothesis: str = dspy.InputField()
    explanation: str = dspy.OutputField(desc="Explain the reasoning for the classification")
    label: Literal['entailment', 'neutral', 'contradiction'] = dspy.OutputField()

# Plain `dspy.Predict` module built over the explanation-plus-label signature.
cot_classifier = dspy.Predict(ImpPresCoTClassifier)


def classify_cot(premise, hypothesis):
    """Classify one (premise, hypothesis) pair with the explanation-producing predictor.

    Only the label is returned; the generated explanation is discarded.

    Args:
        premise: Premise sentence.
        hypothesis: Hypothesis sentence to judge against the premise.

    Returns:
        The `label` attribute of the predictor's result.
    """
    prediction = cot_classifier(premise=premise, hypothesis=hypothesis)
    return prediction.label

## Load ImpPres dataset

In [4]:
from datasets import load_dataset

# ImpPres presupposition configurations to evaluate. Every configuration name is
# "presupposition_" followed by one of the suffixes listed here.
sections = [
    f"presupposition_{suffix}"
    for suffix in (
        'all_n_presupposition',
        'both_presupposition',
        'change_of_state',
        'cleft_existence',
        'cleft_uniqueness',
        'only_presupposition',
        'possessed_definites_existence',
        'possessed_definites_uniqueness',
        'question_presupposition',
    )
]


def _load_section(name):
    """Announce and load one ImpPres configuration from "facebook/imppres"."""
    print(f"Loading dataset for section: {name}")
    return load_dataset("facebook/imppres", name)


# Map each configuration name to the object returned by `load_dataset`.
dataset = {}
for section in sections:
    dataset[section] = _load_section(section)

Loading dataset for section: presupposition_all_n_presupposition
Loading dataset for section: presupposition_both_presupposition
Loading dataset for section: presupposition_change_of_state
Loading dataset for section: presupposition_cleft_existence
Loading dataset for section: presupposition_cleft_uniqueness
Loading dataset for section: presupposition_only_presupposition
Loading dataset for section: presupposition_possessed_definites_existence
Loading dataset for section: presupposition_possessed_definites_uniqueness
Loading dataset for section: presupposition_question_presupposition


In [14]:
# Display the loaded mapping (section name -> DatasetDict) to inspect the
# available features and the number of rows in each split.
dataset

{'presupposition_all_n_presupposition': DatasetDict({
     all_n_presupposition: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_both_presupposition': DatasetDict({
     both_presupposition: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_change_of_state': DatasetDict({
     change_of_state: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_cleft_existence': DatasetDict({
     cleft_existence: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UI

## Evaluate Metrics

Let's use the Hugging Face `evaluate` package to measure the performance of the baseline.


In [23]:
from evaluate import load

# Load the four `evaluate` metrics used by the evaluation cells, in the same
# order as before: accuracy, precision, recall, f1.
accuracy, precision, recall, f1 = (
    load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

In [15]:
import evaluate
# Bundle accuracy, f1, precision and recall into a single combined metric object.
# NOTE(review): `clf_metrics` is not referenced by any other cell visible in this
# notebook (the individually loaded metrics are used instead) — confirm whether
# this cell is still needed.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

In [None]:
from tqdm import tqdm

def evaluate_on_section(dataset, use_cot):
    """Classify every example of one split and pair each prediction with its gold label.

    Args:
        dataset: Iterable of examples; each example must provide 'premise',
            'hypothesis' and an integer 'gold_label' that indexes
            ("entailment", "neutral", "contradiction").
        use_cot: When truthy, predictions come from `classify_cot`;
            otherwise from `classify`.

    Returns:
        A list with one dict per example containing 'pred_label' (the value
        returned by the chosen classifier) and 'gold_label' (the gold label
        converted to its string name).
    """
    label_names = ("entailment", "neutral", "contradiction")
    # The classifier choice is the same for every example, so pick it once.
    predict = classify_cot if use_cot else classify
    return [
        {
            'pred_label': predict(row['premise'], row['hypothesis']),
            'gold_label': label_names[row['gold_label']],
        }
        for row in tqdm(dataset)
    ]

### Basic classifier

In [None]:
import pandas as pd
from IPython.display import display

# Evaluate the basic classifier on the first rows of every section and tabulate
# macro-averaged metrics per section, followed by their unweighted mean ("Overall").

# Every configuration holds a single split named like the configuration without
# this prefix; naming it replaces the former magic slice `section[15:]`.
SECTION_PREFIX = "presupposition_"
# Rows evaluated per section; every row triggers one classifier call.
N_EXAMPLES_BASIC = 40

# The label -> class-id mapping is the same for all sections, so build it once
# instead of on every loop iteration.
label_to_int = {"entailment": 0, "neutral": 1, "contradiction": 2}

accuracies = []
precisions = []
recalls = []
f1s = []
results_table = []

for section in sections:
    print(f"Working on section: {section}")
    sec = section[len(SECTION_PREFIX):]

    data = dataset[section]
    data = data[sec].select(range(N_EXAMPLES_BASIC))

    section_results = evaluate_on_section(data, False)

    # The metrics are computed on integer class ids, so both the predicted and
    # the gold label strings are mapped through label_to_int.
    predictions = [result['pred_label'] for result in section_results]
    references = [result['gold_label'] for result in section_results]
    predictions_int = [label_to_int[pred] for pred in predictions]
    references_int = [label_to_int[ref] for ref in references]

    accuracy_score = accuracy.compute(predictions=predictions_int, references=references_int)['accuracy']
    f1_score = f1.compute(predictions=predictions_int, references=references_int, average='macro')['f1']
    # zero_division=0 scores a class with an empty denominator as 0.
    precision_score = precision.compute(predictions=predictions_int, references=references_int, average='macro', zero_division=0)['precision']
    recall_score = recall.compute(predictions=predictions_int, references=references_int, average='macro', zero_division=0)['recall']

    accuracies.append(accuracy_score)
    precisions.append(precision_score)
    recalls.append(recall_score)
    f1s.append(f1_score)

    # One table row per section, with scores rendered to two decimals.
    results_table.append({
        'Section': sec,
        'Accuracy': f"{accuracy_score:.2f}",
        'Precision': f"{precision_score:.2f}",
        'Recall': f"{recall_score:.2f}",
        'F1': f"{f1_score:.2f}",
    })

# Overall scores are the unweighted mean of the per-section scores; every
# section contributes the same number of rows.
accuracy_all = sum(accuracies) / len(accuracies)
precision_all = sum(precisions) / len(precisions)
recall_all = sum(recalls) / len(recalls)
f1_all = sum(f1s) / len(f1s)

results_table.append({
    'Section': 'Overall',
    'Accuracy': f"{accuracy_all:.2f}",
    'Precision': f"{precision_all:.2f}",
    'Recall': f"{recall_all:.2f}",
    'F1': f"{f1_all:.2f}",
})

# Display results as a centered table
results_df = pd.DataFrame(results_table)
styled_df = results_df.style.set_properties(**{'text-align': 'center'})
styled_df = styled_df.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
display(styled_df)


Working on section: presupposition_all_n_presupposition


100%|██████████| 40/40 [00:00<00:00, 532.29it/s]


Working on section: presupposition_both_presupposition


100%|██████████| 40/40 [02:42<00:00,  4.07s/it]


Working on section: presupposition_change_of_state


100%|██████████| 40/40 [02:30<00:00,  3.77s/it]


Working on section: presupposition_cleft_existence


100%|██████████| 40/40 [02:35<00:00,  3.89s/it]


Working on section: presupposition_cleft_uniqueness


100%|██████████| 40/40 [02:38<00:00,  3.97s/it]


Working on section: presupposition_only_presupposition


100%|██████████| 40/40 [02:41<00:00,  4.03s/it]


Working on section: presupposition_possessed_definites_existence


100%|██████████| 40/40 [02:28<00:00,  3.72s/it]


Working on section: presupposition_possessed_definites_uniqueness


100%|██████████| 40/40 [02:48<00:00,  4.21s/it]


Working on section: presupposition_question_presupposition


100%|██████████| 40/40 [02:34<00:00,  3.87s/it]


Unnamed: 0,Section,Accuracy,Precision,Recall,F1
0,all_n_presupposition,0.97,0.98,0.97,0.97
1,both_presupposition,1.0,1.0,1.0,1.0
2,change_of_state,0.6,0.69,0.55,0.53
3,cleft_existence,0.68,0.85,0.63,0.63
4,cleft_uniqueness,0.45,0.47,0.38,0.29
5,only_presupposition,0.75,0.87,0.72,0.74
6,possessed_definites_existence,0.95,0.96,0.94,0.95
7,possessed_definites_uniqueness,0.45,0.47,0.38,0.29
8,question_presupposition,0.82,0.9,0.8,0.81
9,Overall,0.74,0.8,0.71,0.69


### Chain of Thought Classifier

In [None]:
import pandas as pd
from IPython.display import display

# Evaluate the explanation-producing classifier on the first rows of every section
# and tabulate macro-averaged metrics per section, followed by their unweighted
# mean ("Overall").

# Every configuration holds a single split named like the configuration without
# this prefix; naming it replaces the former magic slice `section[15:]`.
SECTION_PREFIX = "presupposition_"
# Rows evaluated per section; every row triggers one classifier call.
# NOTE(review): the basic-classifier run samples 40 rows per section, so the two
# result tables are computed on different subsets and are not directly comparable.
N_EXAMPLES_COT = 10

# The label -> class-id mapping is the same for all sections, so build it once
# instead of on every loop iteration.
label_to_int = {"entailment": 0, "neutral": 1, "contradiction": 2}

accuracies = []
precisions = []
recalls = []
f1s = []
results_table = []

for section in sections:
    print(f"Working on section: {section}")
    sec = section[len(SECTION_PREFIX):]

    data = dataset[section]
    data = data[sec].select(range(N_EXAMPLES_COT))

    section_results = evaluate_on_section(data, True)

    # The metrics are computed on integer class ids, so both the predicted and
    # the gold label strings are mapped through label_to_int.
    predictions = [result['pred_label'] for result in section_results]
    references = [result['gold_label'] for result in section_results]
    predictions_int = [label_to_int[pred] for pred in predictions]
    references_int = [label_to_int[ref] for ref in references]

    accuracy_score = accuracy.compute(predictions=predictions_int, references=references_int)['accuracy']
    f1_score = f1.compute(predictions=predictions_int, references=references_int, average='macro')['f1']
    # zero_division=0 scores a class with an empty denominator as 0.
    precision_score = precision.compute(predictions=predictions_int, references=references_int, average='macro', zero_division=0)['precision']
    recall_score = recall.compute(predictions=predictions_int, references=references_int, average='macro', zero_division=0)['recall']

    accuracies.append(accuracy_score)
    precisions.append(precision_score)
    recalls.append(recall_score)
    f1s.append(f1_score)

    # One table row per section, with scores rendered to two decimals.
    results_table.append({
        'Section': sec,
        'Accuracy': f"{accuracy_score:.2f}",
        'Precision': f"{precision_score:.2f}",
        'Recall': f"{recall_score:.2f}",
        'F1': f"{f1_score:.2f}",
    })

# Overall scores are the unweighted mean of the per-section scores; every
# section contributes the same number of rows.
accuracy_all = sum(accuracies) / len(accuracies)
precision_all = sum(precisions) / len(precisions)
recall_all = sum(recalls) / len(recalls)
f1_all = sum(f1s) / len(f1s)

results_table.append({
    'Section': 'Overall',
    'Accuracy': f"{accuracy_all:.2f}",
    'Precision': f"{precision_all:.2f}",
    'Recall': f"{recall_all:.2f}",
    'F1': f"{f1_all:.2f}",
})

# Display results as a centered table
results_df = pd.DataFrame(results_table)
styled_df = results_df.style.set_properties(**{'text-align': 'center'})
styled_df = styled_df.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
display(styled_df)
