# ANLI Baseline with LLM

In this notebook you have to implement a baseline for ANLI classification using an LLM.
The baseline must be implemented with DSPy.



In [1]:
# Configure the DSPy environment with the language model - for grok the parameters must be:
# env variable should be in os.environ['XAI_API_KEY']
# "xai/grok-3-mini"
import os
import dspy

# Create the Grok language-model client; the API key is read from the environment
# (a missing XAI_API_KEY variable raises KeyError here).
lm = dspy.LM(
    'xai/grok-3-mini',
    api_key=os.environ['XAI_API_KEY'],
)
# Alternative for a local Ollama server:
# lm = dspy.LM('ollama_chat/devstral', api_base='http://localhost:11434', api_key='')

# Register the client as the LM in DSPy's global configuration.
dspy.configure(lm=lm)

In [2]:
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

class BatchedNLIPredictor(dspy.Module):
    """
    DSPy program to classify NLI examples in batches using a language model.

    Examples are split into chunks of at most ``batch_size``; every chunk is sent
    to the model as one prompt, and chunks are processed concurrently on a
    thread pool of ``max_workers`` threads.

    Args:
        model: Callable invoked as ``model(prompt)``. A list result is joined
            with newlines; any other result is converted with ``str()``.
        batch_size: Maximum number of examples placed in one prompt.
        max_workers: Maximum number of worker threads used by ``forward``.
    """

    # Finds the first NLI label mentioned on a response line (case-insensitive).
    _LABEL_PATTERN = re.compile(r"(entailment|neutral|contradiction)", re.IGNORECASE)

    def __init__(self, model, batch_size=15, max_workers=6):
        super().__init__()
        self.model = model
        self.batch_size = batch_size
        self.max_workers = max_workers

    def _process_chunk(self, chunk_examples, batch_index):
        """Classify one chunk of examples with a single model call.

        Args:
            chunk_examples: Objects exposing ``premise`` and ``hypothesis`` attributes.
            batch_index: Position of the chunk, used only in the progress messages.

        Returns:
            A list of exactly ``len(chunk_examples)`` lowercase labels. Positions
            for which the response held no label are filled with ``"unknown"``.
        """
        print(f"[Batch {batch_index}] Starting processing with {len(chunk_examples)} examples")

        # Build a single prompt for the chunk
        instructions = (
            "Classify the relationship between the hypothesis and premise: "
            "entailment / neutral / contradiction. **Provide a one-word answer**.\n\n"
        )
        example_blocks = [
            f"Example {idx}:\n"
            f"Premise: {ex.premise}\n"
            f"Hypothesis: {ex.hypothesis}\n"
            f"Answer (entailment/neutral/contradiction):\n"
            for idx, ex in enumerate(chunk_examples, start=1)
        ]
        prompt = instructions + "".join(example_blocks)

        # Single LLM call
        response = self.model(prompt)
        response_text = "\n".join(response) if isinstance(response, list) else str(response)

        # Extract predictions: at most one label (the first match) per response line
        predictions = []
        for line in response_text.splitlines():
            match = self._LABEL_PATTERN.search(line)
            if match:
                predictions.append(match.group(1).lower())

        expected = len(chunk_examples)
        # Drop surplus labels: a verbose response can mention more labels than there
        # are examples, and returning them would shift every later batch when
        # forward() flattens the per-chunk results.
        del predictions[expected:]
        # Pad any missing predictions
        predictions.extend(["unknown"] * (expected - len(predictions)))

        print(f"[Batch {batch_index}] Finished processing")

        return predictions

    def forward(self, examples):
        """
        Main pipeline:
        - Splits examples into batches
        - Runs them in parallel
        - Returns predictions in the original order

        Args:
            examples: Sliceable sequence of objects with ``premise`` and
                ``hypothesis`` attributes.

        Returns:
            A flat list with one label per input example, or ``[]`` when
            ``examples`` is empty.
        """
        if not examples:
            return []

        # Split into batches
        chunks = [
            examples[start:start + self.batch_size]
            for start in range(0, len(examples), self.batch_size)
        ]

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Executor.map yields results in submission order, so no explicit
            # re-sorting by batch index is needed.
            per_chunk = list(
                executor.map(self._process_chunk, chunks, range(len(chunks)))
            )

        # Flatten results in original order
        return [label for chunk_labels in per_chunk for label in chunk_labels]


## Load ANLI dataset

In [5]:
from datasets import load_dataset

# Load all ANLI splits, then keep only the rows whose 'reason' field is present
# (neither None nor the empty string).
dataset = load_dataset("facebook/anli")
dataset = dataset.filter(lambda x: x['reason'] is not None and x['reason'] != "")

In [6]:
# Display the filtered DatasetDict to inspect its splits and row counts
dataset

DatasetDict({
    train_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 2923
    })
    dev_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 4861
    })
    dev_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 13375
    })
    dev_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1200


In [22]:

# 1. Wrap each test_r3 row in a DSPy example that carries only the two text fields
test_r3 = dataset['test_r3']
examples = [
    dspy.Example(premise=premise, hypothesis=hypothesis)
    for premise, hypothesis in zip(test_r3["premise"], test_r3["hypothesis"])
]

# 2. Build the batched classifier on top of the configured LM
classifier = BatchedNLIPredictor(model=lm, batch_size=15, max_workers=6)

# 3. Classify all examples (calling the module triggers forward()) and pair every
#    predicted label with its uid to make debugging easier
uids = test_r3['uid']
ordered_llm_predictions = [
    (uid, label) for uid, label in zip(uids, classifier(examples))
]



[Batch 0] Starting processing with 15 examples
[Batch 0] Finished processing
[Batch 1] Starting processing with 15 examples
[Batch 1] Finished processing
[Batch 2] Starting processing with 15 examples
[Batch 2] Finished processing
[Batch 3] Starting processing with 15 examples
[Batch 4] Starting processing with 15 examples
[Batch 3] Finished processing
[Batch 5] Starting processing with 15 examples
[Batch 4] Finished processing
[Batch 5] Finished processing
[Batch 6] Starting processing with 15 examples
[Batch 7] Starting processing with 15 examples
[Batch 8] Starting processing with 15 examples
[Batch 8] Finished processing
[Batch 9] Starting processing with 15 examples
[Batch 9] Finished processing
[Batch 7] Finished processing
[Batch 6] Finished processing
[Batch 10] Starting processing with 15 examples
[Batch 12] Starting processing with 15 examples
[Batch 13] Starting processing with 15 examples
[Batch 11] Starting processing with 15 examples
[Batch 14] Starting processing with 15

## Evaluate Metrics

Let's use the Hugging Face `evaluate` package to compute the performance of the baseline.


In [24]:
from evaluate import load

# Load evaluation metrics from the `evaluate` library
accuracy = load("accuracy")
precision = load("precision")
recall = load("recall")
f1 = load("f1")

# Define mapping from string labels to integer IDs
# Defined in Cell 11 in anli_baseline.ipynb
label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}

# BatchedNLIPredictor pads a batch with "unknown" when the LLM response contains too
# few labels. Map every label that is not in label2id to an id outside the three
# classes so it is scored as a wrong prediction instead of raising KeyError.
unknown_id = len(label2id)

# Extract predicted labels from ordered_llm_predictions (second element in each tuple)
predicted_labels = [
    label2id.get(label.lower(), unknown_id) for uid, label in ordered_llm_predictions
]

# Extract gold labels from test_r3, slice to match predictions length to be safe
gold_labels = test_r3['label'][:len(predicted_labels)]


# Compute all metrics using integer IDs for predicted and gold labels
acc_result = accuracy.compute(predictions=predicted_labels, references=gold_labels)
prec_result = precision.compute(predictions=predicted_labels, references=gold_labels, average="weighted")
rec_result = recall.compute(predictions=predicted_labels, references=gold_labels, average="weighted")
f1_result = f1.compute(predictions=predicted_labels, references=gold_labels, average="weighted")

# Print out the evaluation results
print("Accuracy:", acc_result["accuracy"])
print("Precision:", prec_result["precision"])
print("Recall:", rec_result["recall"])
print("F1:", f1_result["f1"])


Accuracy: 0.69
Precision: 0.7035973636189568
Recall: 0.69
F1: 0.6941264745053273


# TASK 1.3

## Your Turn

Compute the classification metrics on the baseline LLM model on each test section of the ANLI dataset for samples that have a non-empty 'reason' field.

You also must show a comparison between the DeBERTa baseline model and this LLM baseline model. The comparison metric should compute the agreement between the two models:
* On how many samples they are both correct [Correct]
* On how many samples Model1 is correct and Model2 is incorrect [Correct1]
* On how many samples Model1 is incorrect and Model2 is correct [Correct2]
* On how many samples both are incorrect [Incorrect]

# NOTE FOR MICHAEL: 

1) The assignment instructions on git say: "Evaluate the model on the "test_r3" partition of the ANLI dataset", not on EACH test partition.
So we compare only on the "test_r3" partition.

2) We define the DeBERTa baseline model as "Model 1" and the LLM baseline model as "Model 2".

In [25]:
# Restore pred_test_r3 from IPython's %store database; it is used below as the
# Model 1 predictions.
# NOTE(review): presumably saved with `%store pred_test_r3` in the DeBERTa baseline notebook — confirm.
%store -r pred_test_r3

In [26]:
# Model 1 predictions and gold labels (from pred_test_r3); each record is read below
# through its 'gold_label' and 'pred_label' keys.
model1_preds = pred_test_r3[:len(ordered_llm_predictions)]

# Model 2 predictions from ordered_llm_predictions: (uid, predicted label) tuples
model2_preds = ordered_llm_predictions

# zip() stops at the shorter sequence, so materialise the pairs that are actually
# compared and count those; otherwise the percentages would not sum to 100% when
# the two prediction lists differ in length.
# NOTE(review): pairs are aligned by position only, not by uid — confirm that both
# lists follow the test_r3 row order.
compared_pairs = list(zip(model1_preds, model2_preds))
n = len(compared_pairs)

both_correct = 0
only_model1_correct = 0
only_model2_correct = 0
both_incorrect = 0

for example, (uid, pred2) in compared_pairs:
    gold = example['gold_label']
    pred1 = example['pred_label']

    model1_correct = (pred1 == gold)
    model2_correct = (pred2 == gold)

    if model1_correct and model2_correct:
        both_correct += 1
    elif model1_correct:
        only_model1_correct += 1
    elif model2_correct:
        only_model2_correct += 1
    else:
        both_incorrect += 1

# Avoid ZeroDivisionError when there was nothing to compare (all counts are 0 then).
denominator = max(n, 1)

print(f"Both models correct on {both_correct} samples ({both_correct / denominator * 100:.2f}%).")
print(f"Only Model 1 correct on {only_model1_correct} samples ({only_model1_correct / denominator * 100:.2f}%).")
print(f"Only Model 2 correct on {only_model2_correct} samples ({only_model2_correct / denominator * 100:.2f}%).")
print(f"Both models incorrect on {both_incorrect} samples ({both_incorrect / denominator * 100:.2f}%).")


Both models correct on 430 samples (35.83%).
Only Model 1 correct on 147 samples (12.25%).
Only Model 2 correct on 398 samples (33.17%).
Both models incorrect on 225 samples (18.75%).


# Task 1.4