# ANLI Baseline with LLM

In this notebook you will implement a baseline for ANLI classification using an LLM.
The baseline must be implemented with DSPy.



In [21]:
# Configure the DSPy environment with the language model - for grok the parameters must be:
# env variable should be in os.environ['XAI_API_KEY']
# "xai/grok-3-mini"
from dotenv import load_dotenv
import os
import dspy
# Load the key from the local env file (a key already exported in the shell also works).
load_dotenv("grok_key.ini")

# Fail early with an actionable message instead of a bare KeyError (or an
# empty key being handed to the LM) when the key is missing.
xai_api_key = os.environ.get('XAI_API_KEY')
if not xai_api_key:
    raise KeyError(
        "XAI_API_KEY is not set: add it to grok_key.ini or export it before running this cell"
    )
lm = dspy.LM('xai/grok-3-mini', api_key=xai_api_key)

# for ollama 
# lm = dspy.LM('ollama_chat/devstral', api_base='http://localhost:11434', api_key='')
dspy.configure(lm=lm)

In [22]:
## Implement the DSPy classifier program.

from typing import Literal
from tqdm import tqdm
import dspy

# Signature for the NLI task
class NLISignature(dspy.Signature):
    """
    Classify the relationship between the premise and hypothesis 
    to a label: entailment, neutral or contradiction.
    """
    # NOTE(review): DSPy signatures normally use the docstring above as the
    # instruction text sent to the LM, so rewording it is a behavior change,
    # not a documentation change -- confirm before editing.

    # Inputs: the two texts whose relationship is being classified.
    premise: str = dspy.InputField()
    hypothesis: str = dspy.InputField()
    # Output: restricted to the three NLI label names via the Literal type.
    label: Literal['entailment', 'neutral', 'contradiction'] = dspy.OutputField()

# A class for Parallel processing with progress display
class NLIClassifier(dspy.Module):
    """Run a DSPy predictor over a list of examples in fixed-size chunks.

    Each chunk is handed to the wrapped predictor's ``batch`` method, and an
    outer tqdm bar tracks how many chunks have completed.
    """

    def __init__(self, predictor_module: dspy.Module, batch_size: int = 20, num_threads: int = 8):
        """Store the predictor and the chunking/threading settings.

        Args:
            predictor_module: the DSPy module to run (Predict, ChainOfThought, etc.).
            batch_size: number of examples passed to ``batch`` per chunk; must be >= 1.
            num_threads: forwarded as ``num_threads`` to the predictor's ``batch`` call.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        super().__init__()
        # A step of 0 makes range() raise an obscure error and a negative step
        # yields an empty range (silently returning no predictions), so reject
        # both up front.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        self.predictor = predictor_module  # Predict, ChainOfThought, etc.
        self.batch_size = batch_size
        self.num_threads = num_threads

    def forward(self, examples: list[dspy.Example]) -> list[dspy.Prediction]:
        """Predict on every example, chunk by chunk, preserving chunk order.

        Args:
            examples: the examples to classify; sliced into chunks of ``batch_size``.

        Returns:
            The concatenation of what ``predictor.batch`` returned for each chunk.
        """
        results = []
        # The outer bar counts chunks, not individual examples.
        for start in tqdm(range(0, len(examples), self.batch_size), desc="Processing"):
            sub_batch = examples[start:start + self.batch_size]
            processed = self.predictor.batch(  # perform batch processing
                sub_batch,
                num_threads=self.num_threads
            )
            results.extend(processed)

        return results

## Load ANLI dataset

In [23]:
from datasets import load_dataset

# Load ANLI and keep only samples whose 'reason' is non-empty: the truthiness
# check drops both None and "". Done as a single statement so re-running the
# cell always starts from the freshly loaded dataset.
dataset = load_dataset("facebook/anli").filter(lambda x: bool(x['reason']))

In [24]:
dataset

DatasetDict({
    train_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 2923
    })
    dev_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 4861
    })
    dev_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 13375
    })
    dev_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1200


## Evaluate Metrics

Let's use the huggingface `evaluate` package to compute the performance of the baseline.


In [25]:
from evaluate import load

# Load the four classification metrics in one pass; the name order on the
# right matches the variable order on the left.
accuracy, precision, recall, f1 = (
    load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)


In [26]:
import evaluate
# Bundle the four metrics so that one compute() call reports all of them.
clf_metrics = evaluate.combine(
    [
        "accuracy",
        "f1",
        "precision",
        "recall",
    ]
)

In [27]:
# Smoke-test the combined metric on a three-sample toy example.
clf_metrics.compute(
    references=[0, 1, 1],
    predictions=[0, 1, 0],
)

{'accuracy': 0.6666666666666666,
 'f1': 0.6666666666666666,
 'precision': 1.0,
 'recall': 0.5}

## Your Turn

Compute the classification metrics on the baseline LLM model on each test section of the ANLI dataset for samples that have a non-empty 'reason' field.

You also must show a comparison between the DeBERTa baseline model and this LLM baseline model. The comparison metric should compute the agreement between the two models:
* On how many samples they are both correct [Correct]
* On how many samples Model1 is correct and Model2 is incorrect [Correct1]
* On how many samples Model1 is incorrect and Model2 is correct [Correct2]
* On how many samples both are incorrect [Incorrect]

First, we optimize the model on a training set sampled from the `dev_r3` split:

In [28]:
# prepare the training set
import random 

# The dataset rows carry the label as an integer class id, while the
# signature's `label` output is a label name. Store the name so that any demo
# built from these examples matches the output format the model must produce.
# The mapping mirrors `label2id` used by the metric.
ID2LABEL = {0: "entailment", 1: "neutral", 2: "contradiction"}

preprocessed_examples = [
    dspy.Example(
        premise=row["premise"],
        hypothesis=row["hypothesis"],
        # Fall back to the raw value if it is not one of the known ids.
        label=ID2LABEL.get(row["label"], row["label"])
    ).with_inputs("premise", "hypothesis")
    for row in dataset['dev_r3']  # Use the 'dev_r3' split for training
]

train_set_size = 40 # tradeoff between quality and speed after testing, permitted range is 20-100
TRAIN_SEED = 42  # fixed seed so the sampled training set is reproducible across runs

# A dedicated Random instance keeps the sample reproducible without touching
# the global random state; clamping avoids a ValueError on a small split.
trainset = random.Random(TRAIN_SEED).sample(
    preprocessed_examples,
    min(train_set_size, len(preprocessed_examples)),
)
print(f"Total examples: {len(trainset)}")

Total examples: 40


In [29]:
# Do the optimization using few-shot learning - only in task 1.4 we will use CoT

from dspy.teleprompt import BootstrapFewShot

label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}
# Inverse mapping, built once instead of on every metric call.
id2label = {idx: name for name, idx in label2id.items()}


def _normalize_label(raw):
    """Return the canonical label name for `raw`, or None if it is not recognized.

    Accepts a label name in any casing / surrounding whitespace, or a class id
    given as an int or a digit string.
    """
    text = str(raw).strip().lower()
    if text.isdecimal():
        # Class id: unknown ids map to None instead of raising KeyError.
        return id2label.get(int(text))
    return text if text in label2id else None


def exact_match(example, pred, trace=None):
    """Metric: True iff the predicted label equals the gold label.

    Args:
        example: object with a `label` attribute (label name or integer class id).
        pred: object with a `label` attribute holding the predicted label.
        trace: unused; accepted so the function can be passed as a DSPy metric.

    Returns:
        bool: True only when both labels are recognized and identical. An
        unrecognized or missing label on either side never counts as a match
        (previously two unrecognized labels compared as None == None -> True).
    """
    gold = _normalize_label(example.label)
    guess = _normalize_label(getattr(pred, "label", None))
    return gold is not None and gold == guess

def compute_metrics(preds, golds):
    """Score predictions against gold labels.

    Args:
        preds: predicted class ids.
        golds: reference class ids, aligned with `preds`.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1"; the last
        three are requested with average="macro".
    """
    scores = {
        "accuracy": accuracy.compute(predictions=preds, references=golds)["accuracy"],
    }
    # The remaining metrics share the same call shape and macro averaging.
    macro_metrics = (("precision", precision), ("recall", recall), ("f1", f1))
    for key, metric in macro_metrics:
        outcome = metric.compute(predictions=preds, references=golds, average="macro")
        scores[key] = outcome[key]
    return scores

# Plain Predict module over the NLI signature: this is the student program to optimize.
model_simple = dspy.Predict(NLISignature)
# Few-shot optimizer that scores bootstrapped traces with the exact_match metric.
bootstrap = BootstrapFewShot(metric=exact_match)
# Compile the student on the sampled training set; the result is the program evaluated below.
optimized_bootstrap = bootstrap.compile(student=model_simple, trainset=trainset)

 15%|█▌        | 6/40 [00:37<03:35,  6.33s/it]

Bootstrapped 4 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.





In [None]:
# run the optimized model on 'test_r3' split
# Build the labeled and unlabeled views of the test split in a single pass
# over the dataset (Use the 'test_r3' split for evaluation).
testset_with_labels = []
testset_no_labels = []
for row in dataset['test_r3']:
    text_fields = {"premise": row["premise"], "hypothesis": row["hypothesis"]}
    testset_with_labels.append(
        dspy.Example(**text_fields, label=row["label"]).with_inputs("premise", "hypothesis")
    )
    testset_no_labels.append(
        dspy.Example(**text_fields).with_inputs("premise", "hypothesis")
    )

program = NLIClassifier(optimized_bootstrap)
predictions = program(testset_no_labels)

# Class id assigned to a prediction whose label is missing or unrecognized.
# It matches no gold label, so such predictions are scored as incorrect rather
# than crashing the cell with a KeyError after a long, paid inference run.
INVALID_LABEL_ID = -1


def _prediction_to_id(pred):
    """Map one prediction to its class id, tolerating casing, whitespace and failures."""
    raw_label = getattr(pred, "label", None)  # pred may be None if the call failed
    return label2id.get(str(raw_label).strip().lower(), INVALID_LABEL_ID)


pred_labels = [_prediction_to_id(pred) for pred in predictions]

num_invalid = pred_labels.count(INVALID_LABEL_ID)
if num_invalid:
    print(f"Warning: {num_invalid} predictions had no recognizable label and were mapped to {INVALID_LABEL_ID}.")

Processing:   0%|          | 0/60 [00:00<?, ?it/s]

Processed 20 / 20 examples: 100%|██████████| 20/20 [00:19<00:00,  1.05it/s]

Processing:   2%|▏         | 1/60 [00:19<18:45, 19.08s/it]


Processed 20 / 20 examples: 100%|██████████| 20/20 [00:14<00:00,  1.35it/s]

Processing:   3%|▎         | 2/60 [00:33<16:01, 16.58s/it]


Processed 20 / 20 examples: 100%|██████████| 20/20 [00:13<00:00,  1.51it/s]

Processing:   5%|▌         | 3/60 [00:47<14:25, 15.18s/it]


Processed 20 / 20 examples: 100%|██████████| 20/20 [00:12<00:00,  1.67it/s]

Processing:   7%|▋         | 4/60 [00:59<13:00, 13.94s/it]


Processed 20 / 20 examples: 100%|██████████| 20/20 [00:14<00:00,  1.38it/s]

Processing:   8%|▊         | 5/60 [01:14<12:59, 14.16s/it]


Processed 20 / 20 examples: 100%|██████████| 20/20 [00:12<00:00,  1.59it/s]

Processing:  10%|█         | 6/60 [01:26<12:19, 13.69s/it]


Processed 20 / 20 examples: 100%|██████████| 20/20 [00:10<00:00,  1.88it/s]

Processing:  12%|█▏        | 7/60 [01:37<11:13, 12.70s/it]


Processed 20 / 20 examples: 100%|██████████| 20/20 [00:14<00:00,  1.38it/s]

Processing:  13%|█▎        | 8/60 [01:51<11:30, 13.28s/it]


Processed 20 / 20 examples: 100%|██████████| 20/20 [00:14<00:00,  1.40it/s]

Processing:  15%|█▌        | 9/60 [02:06<11:34, 13.62s/it]


Processed 20 / 20 examples: 100%|██████████| 20/20 [00:13<00:00,  1.49it/s]

Processing:  17%|█▋        | 10/60 [02:19<11:18, 13.56s/it]


Processed 20 / 20 examples: 100%|██████████| 20/20 [00:16<00:00,  1.19it/s]

Processing:  18%|█▊        | 11/60 [02:36<11:53, 14.57s/it]


Processed 20 / 20 examples: 100%|██████████| 20/20 [00:12<00:00,  1.63it/s]

Processing:  20%|██        | 12/60 [02:49<11:07, 13.90s/it]


Processed 20 / 20 examples: 100%|██████████| 20/20 [00:11<00:00,  1.70it/s]

Processing:  22%|██▏       | 13/60 [03:00<10:23, 13.26s/it]


Processed 20 / 20 examples: 100%|██████████| 20/20 [00:12<00:00,  1.63it/s]

Processing:  23%|██▎       | 14/60 [03:13<09:56, 12.97s/it]


Processed 20 / 20 examples: 100%|██████████| 20/20 [00:23<00:00,  1.16s/it]

Processing:  25%|██▌       | 15/60 [03:36<12:02, 16.06s/it]


  0%|          | 0/20 [00:00<?, ?it/s]

# TASK 1.3 Answers

a) Compute the classification metrics on the baseline LLM model on each test section of the ANLI dataset for samples that have a non-empty 'reason' field.

In [30]:
# use compute_metrics to evaluate the model and print the results
# Collect the gold class ids in test-set order and score the predictions.
gold_labels = []
for labeled_example in testset_with_labels:
    gold_labels.append(labeled_example.label)
metrics = compute_metrics(pred_labels, gold_labels)

# Assemble the report first, then emit it in one print call.
report_lines = [
    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@",
    "Model scores:",
    f"F1 score: {metrics['f1']:.4f}",
    f"Accuracy: {metrics['accuracy']:.4f}",
    f"Precision: {metrics['precision']:.4f}",
    f"Recall: {metrics['recall']:.4f}",
    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@",
]
print("\n".join(report_lines))

NameError: name 'testset_with_labels' is not defined

Compare the results with the baseline and provide agreement metrics between the two models.

In [23]:
# how many samples they are both correct
%store -r pred_test_r3
optimized_llm_predictions = predictions
non_llm_predictions = pred_test_r3 # DeBERTa_v3_predictions
TEST_SIZE = len(optimized_llm_predictions)
# gold_labels = [label2id[row['gold_label']] for row in pred_test_r3]


# on how many samples both models are correct
correct_predictions = sum(
    1 for row, llm_pred in zip(non_llm_predictions,optimized_llm_predictions)
    if (llm_pred.label == row['gold_label']) and (row['gold_label'] == row['pred_label'])
)

print(f"Both models are correct on {correct_predictions} out of {TEST_SIZE} samples.")
print(f"Both models are correct on {correct_predictions / TEST_SIZE * 100:.2f}% of the samples.")
print("\n")

# On how many samples llm is correct and DeBERTa_v3_ is incorrect
llm_correct_deberta_incorrect = sum(
    1 for row, llm_pred in zip(non_llm_predictions,optimized_llm_predictions)
    if (llm_pred.label == row['gold_label']) and (row['gold_label'] != row['pred_label'])
)
print(f"LLM is correct and DeBERTa_v3 is incorrect on {llm_correct_deberta_incorrect} out of {TEST_SIZE} samples.")
print(f"LLM is correct and DeBERTa_v3 is incorrect on {llm_correct_deberta_incorrect / TEST_SIZE * 100:.2f}% of the samples.")
print("\n")

# On how many samples DeBERTa_v3 is correct and llm is incorrect
deberta_correct_llm_incorrect = sum(
    1 for row, llm_pred in zip(non_llm_predictions,optimized_llm_predictions)
    if (row['pred_label'] == row['gold_label']) and (llm_pred.label != row['gold_label'])
)
print(f"DeBERTa_v3 is correct and LLM is incorrect on {deberta_correct_llm_incorrect} out of {TEST_SIZE} samples.")
print(f"DeBERTa_v3 is correct and LLM is incorrect on {deberta_correct_llm_incorrect / TEST_SIZE * 100:.2f}% of the samples.")
print("\n")


# on how many samples both models are incorrect
both_incorrect = sum(
    1 for row, llm_pred in zip(non_llm_predictions,optimized_llm_predictions)
    if (llm_pred.label != row['gold_label']) and (row['pred_label'] != row['gold_label'])
)
print(f"Both models are incorrect on {both_incorrect} out of {TEST_SIZE} samples.")
print(f"Both models are incorrect on {both_incorrect / TEST_SIZE * 100:.2f}% of the samples.")
print("\n")

Both models are correct on 441 out of 1200 samples.
Both models are correct on 36.75% of the samples.


LLM is correct and DeBERTa_v3 is incorrect on 419 out of 1200 samples.
LLM is correct and DeBERTa_v3 is incorrect on 34.92% of the samples.


DeBERTa_v3 is correct and LLM is incorrect on 136 out of 1200 samples.
DeBERTa_v3 is correct and LLM is incorrect on 11.33% of the samples.


Both models are incorrect on 204 out of 1200 samples.
Both models are incorrect on 17.00% of the samples.


