# ANLI Baseline with LLM

You have to implement in this notebook a baseline for ANLI classification using an LLM.
This baseline must be implemented using DSPy.



In [5]:
# Configure the DSPy environment with the language model - for grok the parameters must be:
# env variable should be in os.environ['XAI_API_KEY']
# "xai/grok-3-mini"
import os
import dspy

from dotenv import load_dotenv
load_dotenv("grok_key.ini") 
lm = dspy.LM('xai/grok-3-mini', api_key=os.environ['XAI_API_KEY'])
# for ollama 
# lm = dspy.LM('ollama_chat/devstral', api_base='http://localhost:11434', api_key='')
dspy.configure(lm=lm)

In [6]:
from typing import Literal

## Implement the DSPy classifier program.
class ANLIClassifier(dspy.Signature):
    premise: str = dspy.InputField()
    hypothesis: str = dspy.InputField()
    label: Literal['entailment', 'neutral', 'contradiction'] = dspy.OutputField()

classier = dspy.Predict(ANLIClassifier)

def classify(premise, hypothesis):
    return classier(premise=premise, hypothesis=hypothesis).label

## Load ANLI dataset

In [7]:
from datasets import load_dataset

dataset = load_dataset("facebook/anli")
dataset = dataset.filter(lambda x: x['reason'] != None and x['reason'] != "")

In [8]:
dataset

DatasetDict({
    train_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 2923
    })
    dev_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 4861
    })
    dev_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 13375
    })
    dev_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1200


## Evaluate Metrics

Let's use the huggingface `evaluate` package to compute the performance of the baseline.


In [9]:
from evaluate import load

accuracy = load("accuracy")
precision = load("precision")
recall = load("recall")
f1 = load("f1")


In [10]:
import evaluate
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

In [11]:
clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])

{'accuracy': 0.6666666666666666,
 'f1': 0.6666666666666666,
 'precision': 1.0,
 'recall': 0.5}

## Your Turn

Compute the classification metrics on the baseline LLM model on each test section of the ANLI dataset for samples that have a non-empty 'reason' field.

You also must show a comparison between the DeBERTa baseline model and this LLM baseline model. The comparison metric should compute the agreement between the two models:
* On how many samples they are both correct [Correct]
* On how many samples Model1 is correct and Model2 is incorrect [Correct1]
* On how many samples Model1 is incorrect and Model2 is correct [Correct2]
* On how many samples both are incorrect [Incorrect]

In [34]:
# Evaluate the llm model on a dataset section
from tqdm import tqdm

def evaluate_llm_on_section(section):
    results = []
    label_names = ["entailment", "neutral", "contradiction"]

    data = dataset[section]

    for example in tqdm(data):
        premise = example['premise']
        hypothesis = example['hypothesis']
        prediction = classify(premise, hypothesis)

        results.append({
            'premise': premise,
            'hypothesis': hypothesis,
            'pred_label': prediction,
            'gold_label': label_names[example['label']]
        })

    # Calculate metrics
    predictions = [result['pred_label'] for result in results]
    references = [result['gold_label'] for result in results]

    label_to_int = {"entailment": 0, "neutral": 1, "contradiction": 2}
    predictions_int = [label_to_int[pred] for pred in predictions]
    references_int = [label_to_int[ref] for ref in references]

    accuracy_score = accuracy.compute(predictions=predictions_int, references=references_int)['accuracy']
    f1_score = f1.compute(predictions=predictions_int, references=references_int, average='macro')['f1']
    precision_score = precision.compute(predictions=predictions_int, references=references_int, average='macro')['precision']
    recall_score = recall.compute(predictions=predictions_int, references=references_int, average='macro')['recall']

    print(f"Results for section {section}:")
    print(f"\tAccuracy: {accuracy_score:.3f}")
    print(f"\tF1: {f1_score:.3f}")
    print(f"\tPrecision: {precision_score:.3f}")
    print(f"\tRecall: {recall_score:.3f}")
    print("-" * 50)

    return results

In [36]:
# Evaluate the model on each test section
llm_result = {}
sections = ['test_r1', 'test_r2', 'test_r3']

for section in sections:
    llm_result[section] = evaluate_llm_on_section(section)

100%|██████████| 1000/1000 [1:04:32<00:00,  3.87s/it]


Results for section test_r1:
	Accuracy: 0.824
	F1: 0.826
	Precision: 0.839
	Recall: 0.824
--------------------------------------------------


100%|██████████| 1000/1000 [1:14:55<00:00,  4.50s/it]


Results for section test_r2:
	Accuracy: 0.759
	F1: 0.761
	Precision: 0.786
	Recall: 0.759
--------------------------------------------------


100%|██████████| 1200/1200 [1:15:36<00:00,  3.78s/it]

Results for section test_r3:
	Accuracy: 0.691
	F1: 0.694
	Precision: 0.744
	Recall: 0.691
--------------------------------------------------





In [22]:
# Load DeBERTa model functions
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
deberta_tokenizer = AutoTokenizer.from_pretrained(model_name)
deberta_model = AutoModelForSequenceClassification.from_pretrained(model_name)

def evaluate_deberta(premise, hypothesis):
    input_data = deberta_tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
    output = deberta_model(input_data["input_ids"].to(device))
    prediction = torch.softmax(output["logits"][0], -1).tolist()
    label_names = ["entailment", "neutral", "contradiction"]
    prediction_dict = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
    
    if prediction_dict["entailment"] > prediction_dict["contradiction"] and prediction_dict["entailment"] > prediction_dict["neutral"]:
        return "entailment"
    elif prediction_dict["contradiction"] > prediction_dict["entailment"]:
        return "contradiction"
    else:
        return "neutral"

In [23]:
# Evaluate the DeBERTa model on a dataset section
def evaluate_deberta_on_section(section):
    results = []
    label_names = ["entailment", "neutral", "contradiction"]

    data = dataset[section]

    for example in tqdm(data):
        premise = example['premise']
        hypothesis = example['hypothesis']
        prediction = evaluate_deberta(premise, hypothesis)

        results.append({
            'premise': premise,
            'hypothesis': hypothesis,
            'pred_label': prediction,
            'gold_label': label_names[example['label']]
        })

    # Calculate metrics
    predictions = [result['pred_label'] for result in results]
    references = [result['gold_label'] for result in results]

    label_to_int = {"entailment": 0, "neutral": 1, "contradiction": 2}
    predictions_int = [label_to_int[pred] for pred in predictions]
    references_int = [label_to_int[ref] for ref in references]

    accuracy_score = accuracy.compute(predictions=predictions_int, references=references_int)['accuracy']
    f1_score = f1.compute(predictions=predictions_int, references=references_int, average='macro')['f1']
    precision_score = precision.compute(predictions=predictions_int, references=references_int, average='macro')['precision']
    recall_score = recall.compute(predictions=predictions_int, references=references_int, average='macro')['recall']

    print(f"Results for section {section}:")
    print(f"\tAccuracy: {accuracy_score:.3f}")
    print(f"\tF1: {f1_score:.3f}")
    print(f"\tPrecision: {precision_score:.3f}")
    print(f"\tRecall: {recall_score:.3f}")
    print("-" * 50)

    return results

In [25]:
debrata_result = {}
for section in sections:
    debrata_result[section] = evaluate_deberta_on_section(section)

100%|██████████| 50/50 [00:15<00:00,  3.28it/s]


Results for section test_r1:
	Accuracy: 0.700
	F1: 0.677
	Precision: 0.713
	Recall: 0.694
--------------------------------------------------


100%|██████████| 50/50 [00:16<00:00,  2.99it/s]


Results for section test_r2:
	Accuracy: 0.480
	F1: 0.464
	Precision: 0.502
	Recall: 0.470
--------------------------------------------------


100%|██████████| 50/50 [00:17<00:00,  2.78it/s]

Results for section test_r3:
	Accuracy: 0.460
	F1: 0.248
	Precision: 0.354
	Recall: 0.267
--------------------------------------------------



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
# Compare the LLM results with DeBERTa
def compare_results(llm_results, deberta_results):
    results = {}
    sections = ['test_r1', 'test_r2', 'test_r3']
    
    for section in sections:
        both_correct = 0
        llm_correct = 0
        deberta_correct = 0
        both_incorrect = 0
        total = 0

        llm_section_results = llm_results[section]
        deberta_section_results = deberta_results[section]
        
        for llm_result, deberta_result in zip(llm_section_results, deberta_section_results):
            total += 1

            llm_pred = llm_result['pred_label']
            deberta_pred = deberta_result['pred_label']
            gold = llm_result['gold_label']

            if llm_pred == gold and deberta_pred == gold:
                both_correct += 1
            elif llm_pred == gold and deberta_pred != gold:
                llm_correct += 1
            elif llm_pred != gold and deberta_pred == gold:
                deberta_correct += 1
            else:
                both_incorrect += 1

        results[section] = {
            'both_correct': both_correct,
            'llm_correct': llm_correct,
            'deberta_correct': deberta_correct,
            'both_incorrect': both_incorrect,
            'total': total
        }

    return results

comparison = compare_results(llm_result, debrata_result)

In [30]:
def compare_all_sections(comparison):
    for section in sections:
        result = comparison[section]
        print(f"Section: {section}")
        print(f"\tBoth Correct: {result['both_correct']} ({result['both_correct'] / result['total']:.2%})")
        print(f"\tLLM Correct: {result['llm_correct']} ({result['llm_correct'] / result['total']:.2%})")
        print(f"\tDeBERTa Correct: {result['deberta_correct']} ({result['deberta_correct'] / result['total']:.2%})")
        print(f"\tBoth Incorrect: {result['both_incorrect']} ({result['both_incorrect'] / result['total']:.2%})")
        print("-" * 50)

    print("-" * 50)
    print("Overall Results:")
    total_both_correct = sum(result['both_correct'] for result in comparison.values())  
    total_llm_correct = sum(result['llm_correct'] for result in comparison.values())
    total_deberta_correct = sum(result['deberta_correct'] for result in comparison.values())
    total_both_incorrect = sum(result['both_incorrect'] for result in comparison.values())
    total = sum(result['total'] for result in comparison.values())  
    print(f"\tBoth Correct: {total_both_correct} ({total_both_correct / total:.2%})")
    print(f"\tLLM Correct: {total_llm_correct} ({total_llm_correct / total:.2%})")
    print(f"\tDeBERTa Correct: {total_deberta_correct} ({total_deberta_correct / total:.2%})")
    print(f"\tBoth Incorrect: {total_both_incorrect} ({total_both_incorrect / total:.2%})")

compare_all_sections(comparison)

Section: test_r1
	Both Correct: 31 (62.00%)
	LLM Correct: 10 (20.00%)
	DeBERTa Correct: 4 (8.00%)
	Both Incorrect: 5 (10.00%)
--------------------------------------------------
Section: test_r2
	Both Correct: 19 (38.00%)
	LLM Correct: 22 (44.00%)
	DeBERTa Correct: 5 (10.00%)
	Both Incorrect: 4 (8.00%)
--------------------------------------------------
Section: test_r3
	Both Correct: 14 (28.00%)
	LLM Correct: 11 (22.00%)
	DeBERTa Correct: 9 (18.00%)
	Both Incorrect: 16 (32.00%)
--------------------------------------------------
--------------------------------------------------
Overall Results:
	Both Correct: 64 (42.67%)
	LLM Correct: 43 (28.67%)
	DeBERTa Correct: 18 (12.00%)
	Both Incorrect: 25 (16.67%)
