# ImpPres with LLM

In this notebook you have to implement a better ImpPres classifier using an LLM.
The classifier must be implemented using DSPy.


In [1]:
# Configure the DSPy environment with the language model. For Grok:
#   - the API key must be available in os.environ['XAI_API_KEY']
#   - the model name is "xai/grok-3-mini"

import os
import dspy

from dotenv import load_dotenv
# Key file and model identifier, named once so they are easy to find and change.
GROK_KEY_FILE = "grok_key.ini"
GROK_MODEL_NAME = 'xai/grok-3-mini'

# Read the key file; it is expected to provide XAI_API_KEY (looked up just below).
load_dotenv(GROK_KEY_FILE)

lm = dspy.LM(GROK_MODEL_NAME, api_key=os.environ['XAI_API_KEY'])
# for ollama
# lm = dspy.LM('ollama_chat/devstral', api_base='http://localhost:11434', api_key='')

# Register `lm` as the language model used by DSPy.
dspy.configure(lm=lm)

In [7]:
from typing import Literal, List

## Implement the DSPy classifier program.
class ParadigmClassifier(dspy.Signature):
    """Label each numbered premise-hypothesis pair with its natural language inference relation.

    Use 'entailment' when the hypothesis must be true given the premise,
    'contradiction' when the hypothesis must be false given the premise,
    and 'neutral' when the premise does not settle the hypothesis either way.
    Return exactly one label per pair, in the same order as the pairs are numbered.
    """

    # NOTE: DSPy sends a Signature's docstring to the model as the task instructions,
    # so the text above is part of the prompt, not only documentation.

    # All pairs of one paradigm, packed into a single string.
    pairs: str = dspy.InputField(desc="All premise-hypothesis pairs from paradigm, numbered and separated by |")
    # One label per pair; the Literal restricts the model to the three NLI classes.
    predictions: List[Literal['entailment', 'neutral', 'contradiction']] = dspy.OutputField(desc="List of predictions for each pair")

# Module-level predictor built from the ParadigmClassifier signature; classify() calls it.
classifier = dspy.Predict(ParadigmClassifier)

def classify(paradigm_pairs):
    """Classify all premise-hypothesis pairs of one paradigm with a single predictor call.

    Args:
        paradigm_pairs: Iterable of mappings, each providing a 'premise' and a
            'hypothesis' entry.

    Returns:
        The `predictions` attribute of the predictor's result. An empty list is
        returned, without calling the predictor, when there are no pairs.

    Raises:
        KeyError: If a pair lacks the 'premise' or 'hypothesis' key.
    """
    # Number the pairs from 1 so the model can keep its answers aligned with the input.
    numbered_pairs = [
        f"{i + 1}. Premise: {pair['premise']}, Hypothesis: {pair['hypothesis']}"
        for i, pair in enumerate(paradigm_pairs)
    ]

    # Nothing to classify: avoid sending an empty prompt to the model.
    if not numbered_pairs:
        return []

    results = classifier(pairs=" | ".join(numbered_pairs))

    # NOTE(review): the number of predictions is not checked against the number
    # of pairs here; callers that align predictions with gold labels should verify it.
    return results.predictions

## Load ImpPres Dataset

In [2]:
from datasets import load_dataset

# ImpPres presupposition configurations loaded below.
sections = [
    "presupposition_all_n_presupposition",
    "presupposition_both_presupposition",
    "presupposition_change_of_state",
    "presupposition_cleft_existence",
    "presupposition_cleft_uniqueness",
    "presupposition_only_presupposition",
    "presupposition_possessed_definites_existence",
    "presupposition_possessed_definites_uniqueness",
    "presupposition_question_presupposition",
]


def _load_section(name):
    """Print a progress line for `name`, then load that configuration of facebook/imppres."""
    print(f"Loading dataset for section: {name}")
    return load_dataset("facebook/imppres", name)


# Maps each section name to whatever load_dataset returns for it.
dataset = {}
for section in sections:
    dataset[section] = _load_section(section)

Loading dataset for section: presupposition_all_n_presupposition
Loading dataset for section: presupposition_both_presupposition
Loading dataset for section: presupposition_change_of_state
Loading dataset for section: presupposition_cleft_existence
Loading dataset for section: presupposition_cleft_uniqueness
Loading dataset for section: presupposition_only_presupposition
Loading dataset for section: presupposition_possessed_definites_existence
Loading dataset for section: presupposition_possessed_definites_uniqueness
Loading dataset for section: presupposition_question_presupposition


## Evaluate Metrics

Let's use the Hugging Face `evaluate` package to compute the performance of the classifier.


In [3]:
from evaluate import load

# Load the four metrics one after another and bind each to its own name.
accuracy, precision, recall, f1 = (
    load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)


In [4]:
import evaluate
# Combine accuracy, F1, precision and recall into one metric object.
# NOTE(review): the classifier emits three labels; f1/precision/recall presumably
# default to binary averaging, so compute() likely needs an explicit `average=` — confirm.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])