# ANLI Baseline with LLM

You have to implement in this notebook a baseline for ANLI classification using an LLM.
This baseline must be implemented using DSPy.



In [13]:
# Configure the DSPy environment with the language model - for grok the parameters must be:
# env variable should be in os.environ['XAI_API_KEY']
# "xai/grok-3-mini"
import os
import dspy

lm = dspy.LM('xai/grok-3-mini', api_key=os.environ['XAI_API_KEY'])
# for ollama 
# lm = dspy.LM('ollama_chat/devstral', api_base='http://localhost:11434', api_key='')
dspy.configure(lm=lm)

In [14]:
from typing import Literal

class NLIClassifier(dspy.Signature):
    premise = dspy.InputField(desc="A factual statement")
    hypothesis = dspy.InputField(desc="A statement to evaluate against the premise")
    label = dspy.OutputField(
        desc="The relationship between premise and hypothesis: entailment, neutral, or contradiction",
        choices=["entailment", "neutral", "contradiction"]
    )


# Create a Predict module
nli_predict = dspy.Predict(NLIClassifier)


In [17]:
def predict_batch(batch):
    # Extract lists from the dataset batch
    uids = list(batch["uid"])  # Keep as list
    premises = list(batch["premise"])
    hypotheses = list(batch["hypothesis"])
    
    # Build model inputs
    inputs = [
        {"premise": p, "hypothesis": h}
        for p, h in zip(premises, hypotheses)
    ]
    print(inputs[:5])  # Print first 5 inputs for debugging
    
    # Vectorized DSPy prediction
    predictions = nli_predict.batch(inputs)
    labels = [pred.label for pred in predictions]
    
    # Pair each uid with its prediction
    return list(zip(uids, labels))


def create_batch(dataset, batch_size=32):
    # Create batches of the dataset
    for i in range(0, len(dataset), batch_size):
        yield dataset[i:i + batch_size]

## Load ANLI dataset

In [10]:
from datasets import load_dataset

dataset = load_dataset("facebook/anli")
dataset = dataset.filter(lambda x: x['reason'] != None and x['reason'] != "")

In [12]:
dataset

DatasetDict({
    train_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 2923
    })
    dev_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 4861
    })
    dev_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 13375
    })
    dev_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1200


In [18]:
test_r3 = dataset['test_r3']
mini_test_r3 = test_r3.select(range(100))
# batches = create_batch(test_r3, batch_size=32)
predictions = predict_batch(mini_test_r3)
# for batch in batches:
#     batch_predictions = predict_batch(batch)
#     predictions.extend(batch_predictions)
#     print(f"Processed batch with {len(batch)} examples, total predictions: {len(predictions)}")
#     print(predictions[-5:])  # Print last 5 predictions for debugging


[{'premise': "It is Sunday today, let's take a look at the most popular posts of the last couple of days. Most of the articles this week deal with the iPhone, its future version called the iPhone 8 or iPhone Edition, and new builds of iOS and macOS. There are also some posts that deal with the iPhone rival called the Galaxy S8 and some other interesting stories. The list of the most interesting articles is available below. Stay tuned for more rumors and don't forget to follow us on Twitter.", 'hypothesis': 'The day of the passage is usually when Christians praise the lord together'}, {'premise': 'By The Associated Press WELLINGTON, New Zealand (AP) — All passengers and crew have survived a crash-landing of a plane in a lagoon in the Federated States of Micronesia. WELLINGTON, New Zealand (AP) — All passengers and crew have survived a crash-landing of a plane in a lagoon in the Federated States of Micronesia. Copyright © 2018 The Associated Press. All rights reserved. This material may 

AttributeError: 'dict' object has no attribute 'inputs'

## Evaluate Metrics

Let's use the huggingface `evaluate` package to compute the performance of the baseline.


In [26]:
from evaluate import load

accuracy = load("accuracy")
precision = load("precision")
recall = load("recall")
f1 = load("f1")


In [27]:
import evaluate
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

In [29]:
clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])

{'accuracy': 0.6666666666666666,
 'f1': 0.6666666666666666,
 'precision': 1.0,
 'recall': 0.5}

## Your Turn

Compute the classification metrics on the baseline LLM model on each test section of the ANLI dataset for samples that have a non-empty 'reason' field.

You also must show a comparison between the DeBERTa baseline model and this LLM baseline model. The comparison metric should compute the agreement between the two models:
* On how many samples they are both correct [Correct]
* On how many samples Model1 is correct and Model2 is incorrect [Correct1]
* On how many samples Model1 is incorrect and Model2 is correct [Correct2]
* On how many samples both are incorrect [Incorrect]