# ANLI Baseline with LLM

You have to implement in this notebook a baseline for ANLI classification using an LLM.
This baseline must be implemented using DSPy.



In [2]:
# Configure the DSPy environment with the language model - for grok the parameters must be:
# env variable should be in os.environ['XAI_API_KEY']
# "xai/grok-3-mini"

import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import islice
from evaluate import load
from datasets import load_dataset


import os
import dspy


# Default LM: xAI Grok. The API key is read from the XAI_API_KEY environment
# variable; indexing os.environ raises KeyError if the variable is not set.
lm = dspy.LM('xai/grok-3-mini', api_key=os.environ['XAI_API_KEY'])
# Alternative: a local Ollama server (uncomment to use instead of Grok).
# lm = dspy.LM('ollama_chat/devstral', api_base='http://localhost:11434', api_key='')
# Register `lm` with DSPy; the same `lm` object is also passed explicitly to the predictors below.
dspy.configure(lm=lm)

# Utility functions

1) test_nli_predictor -> run the experiment with specific pipeline on specific dataset

2) evaluate_predictions -> summarize the metrics of the experiments using the "evaluate" package

In [3]:
class classification_response:
    """Plain record describing one classified example.

    Attributes:
        uid: identifier of the example (defaults to None).
        label: predicted label (defaults to None).
        reason: free-text justification (defaults to None).
    """

    def __init__(self, uid = None, label = None, reason = None):
        # Store the three constructor arguments verbatim.
        self.uid, self.label, self.reason = uid, label, reason

In [4]:
def test_nli_predictor(
    dataset,
    split_name,
    model,
    predictor_class,
    example_extractor,
    batch_size=10,
    max_workers=6,
    max_examples=100,
    uid_key='uid',
):
    data_split = dataset[split_name]

    # Iterate only up to max_examples without slicing
    subset_iter = islice(data_split, max_examples)
    
    examples = []
    uids = []
    for row in subset_iter:
        examples.append(example_extractor(row))
        uids.append(row[uid_key])

    predictor = predictor_class(model=model, batch_size=batch_size, max_workers=max_workers)
    predictions = predictor(examples)

    results = list(zip(uids, [p[0] for p in predictions], [p[1] for p in predictions]))
    return results

def extract_example(row):
    """Build a dspy.Example carrying only the row's premise and hypothesis."""
    fields = {"premise": row["premise"], "hypothesis": row["hypothesis"]}
    return dspy.Example(**fields)


In [5]:
def evaluate_predictions(
    ordered_llm_predictions,
    gold_labels,
    label2id,
    gold_label_key=None,
):
    """Print accuracy and weighted precision / recall / F1 for label predictions.

    Args:
        ordered_llm_predictions: sequence of tuples whose second element is the
            predicted label string, e.g. ``(uid, label, reason)``.
        gold_labels: sequence of gold label ids, or a dict holding that
            sequence under ``gold_label_key``.
        label2id: dict from lower-case label string to integer id; predicted
            labels are lower-cased before the lookup.
        gold_label_key: key to read from ``gold_labels`` when it is a dict.

    Prints:
        A header line followed by the four metric values. Returns None.
    """
    metric_names = ("accuracy", "precision", "recall", "f1")
    # Load every metric up front, in a fixed order.
    metrics = {name: load(name) for name in metric_names}

    # Labels missing from label2id are encoded as -1.
    predicted_labels = []
    for item in ordered_llm_predictions:
        predicted_labels.append(label2id.get(item[1].lower(), -1))

    use_key = gold_label_key and isinstance(gold_labels, dict)
    golds = gold_labels[gold_label_key] if use_key else gold_labels

    # Compare only the common prefix of predictions and golds.
    min_len = min(len(predicted_labels), len(golds))
    predicted_labels, golds = predicted_labels[:min_len], golds[:min_len]

    scores = {}
    for name in metric_names:
        # Accuracy takes no averaging mode; the other three use weighted averaging.
        extra = {} if name == "accuracy" else {"average": "weighted"}
        scores[name] = metrics[name].compute(
            predictions=predicted_labels, references=golds, **extra
        )

    print("Evaluation results:")
    print(f"  Accuracy:  {scores['accuracy'].get('accuracy')}")
    print(f"  Precision: {scores['precision'].get('precision')}")
    print(f"  Recall:    {scores['recall'].get('recall')}")
    print(f"  F1:        {scores['f1'].get('f1')}")


## Load ANLI dataset

In [6]:
def _has_reason(row):
    """True when the row carries a non-empty 'reason' annotation."""
    return row['reason'] not in (None, "")


# Load every ANLI split, then keep only the rows that have a 'reason'.
dataset = load_dataset("facebook/anli").filter(_has_reason)

In [7]:
dataset

DatasetDict({
    train_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 2923
    })
    dev_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 4861
    })
    dev_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 13375
    })
    dev_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1200


# TASK 1.3



Below we define the pipeline for task 1.3, using a DSPy-hybrid approach.
1) We use DSPy as a wrapper, because it provides a convenient layout.
2) Yet we do manual prompting, since DSPy was unable to batch multiple examples into one API call, which increased time and usage dramatically.
3) Note: DSPy is just a wrapper for prompting. We don't miss a core element by taking this hybrid approach.
See https://www.dbreunig.com/2024/12/12/pipelines-prompt-optimization-with-dspy.html


In [8]:
class BatchedNLIPredictor(dspy.Module):
    """
    DSPy program to classify NLI examples in batches using a language model.

    Each batch of examples is packed into one manually written prompt and sent
    with a single model call; batches are processed concurrently in a thread
    pool. Every example yields a ``(label, None)`` pair, where label is one of
    'entailment', 'neutral', 'contradiction' or 'unknown'.
    """
    def __init__(self, model, batch_size=15, max_workers=6):
        """
        Args:
            model: callable taking a prompt string and returning either a list
                of strings or a single value convertible with ``str``.
            batch_size: number of examples packed into one prompt.
            max_workers: number of threads issuing model calls concurrently.
        """
        super().__init__()
        self.model = model
        self.batch_size = batch_size
        self.max_workers = max_workers

    def _process_chunk(self, chunk_examples, batch_index):
        """Classify one chunk of examples with a single model call.

        Args:
            chunk_examples: examples exposing ``premise`` and ``hypothesis``.
            batch_index: position of the chunk, used only in progress logs.

        Returns:
            Exactly ``len(chunk_examples)`` ``(label, None)`` tuples.
        """
        print(f"[Batch {batch_index}] Starting processing with {len(chunk_examples)} examples")

        # Build a single prompt for the chunk
        prompt = (
            "Classify the relationship between the hypothesis and premise: "
            "entailment / neutral / contradiction. **Provide a one-word answer**.\n\n"
        )
        for idx, ex in enumerate(chunk_examples, start=1):
            prompt += (
                f"Example {idx}:\n"
                f"Premise: {ex.premise}\n"
                f"Hypothesis: {ex.hypothesis}\n"
                f"Answer (entailment/neutral/contradiction):\n"
            )

        # Single LLM call
        response = self.model(prompt)
        response_text = "\n".join(response) if isinstance(response, list) else str(response)

        # Extract predictions: one per response line that mentions a label.
        # NOTE(review): re.search keeps the FIRST label word on a line, so a
        # line echoing the "Answer (entailment/neutral/contradiction):" template
        # would parse as 'entailment' - confirm the model never echoes it.
        predictions = []
        for line in response_text.splitlines():
            m = re.search(r"(entailment|neutral|contradiction)", line, re.IGNORECASE)
            if m:
                predictions.append((m.group(1).lower(), None))

        expected = len(chunk_examples)

        # Drop surplus predictions. Without this, a response with more matching
        # lines than examples would make the chunk longer than its input and
        # shift every later chunk's predictions away from their examples once
        # forward() concatenates the chunks.
        del predictions[expected:]

        # Pad any missing predictions
        predictions.extend([("unknown", None)] * (expected - len(predictions)))

        print(f"[Batch {batch_index}] Finished processing")

        return predictions

    def forward(self, examples):
        """
        Main pipeline:
        - Splits examples into batches
        - Runs them in parallel
        - Returns predictions in the original order

        Args:
            examples: list of examples exposing ``premise`` and ``hypothesis``.

        Returns:
            List with one ``(label, None)`` tuple per input example.
        """
        if not examples:
            return []

        # Split into batches
        chunks = [
            examples[i:i+self.batch_size]
            for i in range(0, len(examples), self.batch_size)
        ]

        results_by_index = {}
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit each batch to executor, remembering its position
            futures = {
                executor.submit(self._process_chunk, chunk, idx): idx
                for idx, chunk in enumerate(chunks)
            }

            # Collect results as they finish; result() re-raises worker errors
            for future in as_completed(futures):
                results_by_index[futures[future]] = future.result()

        # Flatten results in original order
        ordered_predictions = []
        for idx in sorted(results_by_index):
            ordered_predictions.extend(results_by_index[idx])

        return ordered_predictions


# NOTE FOR MICHAEL: 

1) In the assignment instructions on git, it says "Evaluate the model on the "test_r3" partition of the ANLI dataset", not on EACH test partition.
So we will compare only on the "test_r3" partition.

2) we define DeBERTa baseline model as "Model 1" and the LLM baseline model as "Model 2"

Running the experiment

In [9]:
# Run the single-prompt baseline (BatchedNLIPredictor) on the 'test_r3' split.
# max_examples is only an upper bound on the rows consumed by test_nli_predictor.
# Result: list of (uid, label, reason) tuples; this predictor always sets reason to None.
test_r3_llm_results = test_nli_predictor(
    dataset=dataset,
    split_name='test_r3',
    model=lm,
    predictor_class=BatchedNLIPredictor,
    example_extractor=extract_example,
    batch_size=15,
    max_workers=6,
    max_examples=1200,
    uid_key='uid'
)

[Batch 0] Starting processing with 15 examples
[Batch 1] Starting processing with 15 examples
[Batch 2] Starting processing with 15 examples
[Batch 3] Starting processing with 15 examples
[Batch 1] Finished processing
[Batch 4] Starting processing with 15 examples
[Batch 5] Starting processing with 15 examples
[Batch 6] Starting processing with 15 examples
[Batch 0] Finished processing
[Batch 7] Starting processing with 15 examples
[Batch 2] Finished processing
[Batch 8] Starting processing with 15 examples
[Batch 4] Finished processing
[Batch 9] Starting processing with 15 examples
[Batch 3] Finished processing
[Batch 10] Starting processing with 15 examples
[Batch 5] Finished processing
[Batch 11] Starting processing with 15 examples
[Batch 7] Finished processing
[Batch 12] Starting processing with 15 examples
[Batch 6] Finished processing
[Batch 13] Starting processing with 15 examples
[Batch 8] Finished processing
[Batch 14] Starting processing with 15 examples
[Batch 9] Finished p

Evaluating metrics

In [10]:
# Score the baseline predictions against the gold labels of 'test_r3'.
# Predictions and golds are matched by position; labels outside label2id
# (e.g. 'unknown') are encoded as -1 inside evaluate_predictions.
evaluate_predictions(
    ordered_llm_predictions=test_r3_llm_results,
    gold_labels=dataset['test_r3']['label'],
    label2id={"entailment": 0, "neutral": 1, "contradiction": 2}
)

Evaluation results:
  Accuracy:  0.69
  Precision: 0.7035973636189568
  Recall:    0.69
  F1:        0.6941264745053273


Compute the classification metrics on the baseline LLM model on each test section of the ANLI dataset for samples that have a non-empty 'reason' field.

You also must show a comparison between the DeBERTa baseline model and this LLM baseline model. The comparison metric should compute the agreement between the two models:
* On how many samples they are both correct [Correct]
* On how many samples Model1 is correct and Model2 is incorrect [Correct1]
* On how many samples Model1 is incorrect and Model2 is correct [Correct2]
* On how many samples both are incorrect [Incorrect]

In [11]:
# Import the DeBERTa predictions produced in anli_baseline.ipynb.
# `%store -r` restores a variable persisted earlier with `%store`; this cell
# presumably fails to define pred_test_r3 if that notebook has not stored it first - TODO confirm.
%store -r pred_test_r3

In [12]:
# Model 1 (DeBERTa baseline): list of dicts with keys 'pred_label' and 'gold_label'.
model1_preds = pred_test_r3

# Model 2 (LLM baseline): list of (uid, label, reason) tuples.
model2_preds = test_r3_llm_results

label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}

# The two prediction lists are paired by position over their common prefix.
n = min(len(model1_preds), len(model2_preds))

# Tally keyed by (model 1 correct?, model 2 correct?).
tally = {(True, True): 0, (True, False): 0, (False, True): 0, (False, False): 0}

for example, (_uid, pred2, _reason) in zip(model1_preds[:n], model2_preds[:n]):
    gold = example['gold_label'].lower()
    pred1 = example['pred_label'].lower()
    pred2 = pred2.lower()
    tally[(pred1 == gold, pred2 == gold)] += 1

both_correct = tally[(True, True)]
only_model1_correct = tally[(True, False)]
only_model2_correct = tally[(False, True)]
both_incorrect = tally[(False, False)]

for description, count in (
    ("Both models correct", both_correct),
    ("Only Model 1 correct", only_model1_correct),
    ("Only Model 2 correct", only_model2_correct),
    ("Both models incorrect", both_incorrect),
):
    print(f"{description} on {count} samples ({count / n * 100:.2f}%).")

Both models correct on 430 samples (35.83%).
Only Model 1 correct on 147 samples (12.25%).
Only Model 2 correct on 398 samples (33.17%).
Both models incorrect on 225 samples (18.75%).


# Task 1.4

Joint prompt pipeline: prompt the LLM to produce at once a CoT explanation and a label

In [13]:
class BatchedNLIJointPredictor(dspy.Module):
    """Joint prompt predictor: one model call returns a label and a reason per example.

    Examples are grouped into batches; each batch becomes one prompt and the
    batches run concurrently in a thread pool. Every example yields a
    ``(label, reason)`` pair; unparsable or missing lines become
    ``("unknown", "no reasoning")``.
    """

    def __init__(self, model, batch_size=15, max_workers=6):
        super().__init__()
        self.model = model
        self.batch_size = batch_size
        self.max_workers = max_workers

    def _process_chunk(self, examples, batch_index):
        """Classify one batch and return ``len(examples)`` (label, reason) pairs."""
        expected = len(examples)
        print(f"[Batch {batch_index}] Starting processing with {expected} examples")

        # Assemble the joint prompt: instructions, the examples, output format.
        parts = [
            "Classify the relationship between the hypothesis and premise. "
            "Respond for EACH example on a NEW LINE in the format:\n"
            "label || reason\n"
            "Label ∈ {entailment, neutral, contradiction}. "
            "Reason: 1-2 sentence justification.\n\n"
        ]
        for number, ex in enumerate(examples, start=1):
            parts.append(
                f"Example {number}:\nPremise: {ex.premise}\nHypothesis: {ex.hypothesis}\n"
            )
        parts.append(
            f"\nNow output exactly {expected} lines, one per example, in order, like:\n"
            "entailment || <reason>\nneutral || <reason>\n..."
        )
        prompt = "".join(parts)

        # One model call per batch; a list response is joined into one text.
        response = self.model(prompt)
        if isinstance(response, list):
            response_text = "\n".join(response)
        else:
            response_text = str(response)

        # Non-empty response lines are matched to examples by position.
        stripped = (raw.strip() for raw in response_text.splitlines())
        lines = [text for text in stripped if text]

        line_pattern = re.compile(r"(?i)\s*(entailment|neutral|contradiction)\s*\|\|\s*(.*)")
        results = []
        for line in lines[:expected]:
            hit = line_pattern.match(line)
            if hit is None:
                results.append(("unknown", "no reasoning"))
            else:
                results.append((hit.group(1).lower(), hit.group(2).strip()))

        # Fill up when the model returned fewer lines than examples.
        results.extend([("unknown", "no reasoning")] * (expected - len(results)))

        print(f"[Batch {batch_index}] Finished processing")
        return results

    def forward(self, examples):
        """Split examples into batches, run them concurrently, return results in input order."""
        if not examples:
            return []

        step = self.batch_size
        chunks = [examples[start:start + step] for start in range(0, len(examples), step)]

        collected = {}
        with ThreadPoolExecutor(max_workers=self.max_workers) as pool:
            pending = {}
            for position, chunk in enumerate(chunks):
                pending[pool.submit(self._process_chunk, chunk, position)] = position
            # Batches finish in any order; key each result by its position.
            for done in as_completed(pending):
                collected[pending[done]] = done.result()

        # Concatenate the batches back in their original order.
        return [pair for position in sorted(collected) for pair in collected[position]]

Pipeline model: prompt the LLM to produce a CoT explanation of the relation (premise, hypothesis) - then, given the explanation, produce a label.

In [14]:

class BatchedNLIPipelinePredictor(dspy.Module):
    """
    Two-stage NLI predictor, run batch by batch:
    1. Ask the model for a short reasoning per example
    2. Ask the model to label each example given that reasoning

    Every example yields a ``(label, reasoning)`` pair; unparsable or missing
    lines in stage 2 become ``("unknown", "no reasoning")``.
    """

    def __init__(self, model, batch_size=15, max_workers=6):
        super().__init__()
        self.model = model
        self.batch_size = batch_size
        self.max_workers = max_workers

    @staticmethod
    def _nonempty_lines(response):
        """Flatten a model response into its stripped, non-empty lines."""
        text = "\n".join(response) if isinstance(response, list) else str(response)
        stripped = (raw.strip() for raw in text.splitlines())
        return [line for line in stripped if line]

    def _generate_reasoning(self, examples, batch_index):
        """Stage 1: return exactly one reasoning string per example."""
        expected = len(examples)
        print(f"[Batch {batch_index}] Generating reasoning for {expected} examples")

        parts = [
            "For each example, explain briefly (1-2 sentences) whether the hypothesis "
            "follows from, contradicts, or is unrelated to the premise. "
            "Output one line per example, reasoning only, no label.\n\n"
        ]
        for number, ex in enumerate(examples, start=1):
            parts.append(
                f"Example {number}:\nPremise: {ex.premise}\nHypothesis: {ex.hypothesis}\n"
            )
        parts.append(
            f"\nOutput exactly {expected} lines of reasoning, one per example, in order."
        )

        # Single API call for the whole batch
        reasonings = self._nonempty_lines(self.model("".join(parts)))

        # Pad a short answer, then cut a long one, so the length always matches.
        shortfall = expected - len(reasonings)
        if shortfall > 0:
            reasonings.extend(["No reasoning provided."] * shortfall)
        return reasonings[:expected]

    def _classify_from_reasoning(self, examples, reasonings, batch_index):
        """Stage 2: label each example given its stage-1 reasoning."""
        expected = len(examples)
        print(f"[Batch {batch_index}] Classifying examples based on reasoning")

        parts = [
            "Classify the relationship between premise and hypothesis based on the reasoning provided.\n"
            "Respond in format: label || reasoning\n"
            "Label ∈ {entailment, neutral, contradiction}.\n\n"
        ]
        for number, (ex, reason) in enumerate(zip(examples, reasonings), start=1):
            parts.append(
                f"Example {number}:\nPremise: {ex.premise}\n"
                f"Hypothesis: {ex.hypothesis}\nReasoning: {reason}\n"
            )
        parts.append(
            f"\nNow output exactly {expected} lines like:\n"
            "entailment || <reasoning>\nneutral || <reasoning>\n..."
        )

        lines = self._nonempty_lines(self.model("".join(parts)))

        line_pattern = re.compile(r"(?i)\s*(entailment|neutral|contradiction)\s*\|\|\s*(.*)")
        results = []
        # Response lines are matched to examples by position.
        for line in lines[:expected]:
            hit = line_pattern.match(line)
            if hit is None:
                results.append(("unknown", "no reasoning"))
            else:
                results.append((hit.group(1).lower(), hit.group(2).strip()))

        # Fill up when the model returned fewer lines than examples.
        results.extend([("unknown", "no reasoning")] * (expected - len(results)))
        return results

    def _process_chunk(self, examples, batch_index):
        """Run both stages for one batch and return its (label, reasoning) pairs."""
        reasonings = self._generate_reasoning(examples, batch_index)
        return self._classify_from_reasoning(examples, reasonings, batch_index)

    def forward(self, examples):
        """Split examples into batches, run them concurrently, return results in input order."""
        if not examples:
            return []

        step = self.batch_size
        chunks = [examples[start:start + step] for start in range(0, len(examples), step)]

        collected = {}
        with ThreadPoolExecutor(max_workers=self.max_workers) as pool:
            pending = {}
            for position, chunk in enumerate(chunks):
                pending[pool.submit(self._process_chunk, chunk, position)] = position
            # Batches finish in any order; key each result by its position.
            for done in as_completed(pending):
                collected[pending[done]] = done.result()

        # Concatenate the batches back in their original order.
        return [pair for position in sorted(collected) for pair in collected[position]]


In [18]:
# Generate predictions on 'dev_r3' with both Task 1.4 predictors.
# Despite the wording of the banners, this cell only produces predictions;
# the metrics are computed in a later cell.
print("\n\nEvaluating JOINT results on dev_r3\n\n")

# Joint prompt: label and reason come from a single model call per batch.
dev_r3_llm_joint_results = test_nli_predictor(
    dataset=dataset,
    split_name='dev_r3',
    model=lm,
    predictor_class=BatchedNLIJointPredictor,
    example_extractor=extract_example,
    batch_size=15,
    max_workers=6,
    max_examples=1200,
    uid_key='uid'
)

print("\n\nEvaluating PIPELINE results on dev_r3\n\n")

# Pipeline: two model calls per batch (reasoning first, then the label).
dev_r3_llm_pipeline_results = test_nli_predictor(
    dataset=dataset,
    split_name='dev_r3',
    model=lm,
    predictor_class=BatchedNLIPipelinePredictor,
    example_extractor=extract_example,
    batch_size=15,
    max_workers=6,
    max_examples=1200,
    uid_key='uid'
)



Evaluating JOINT results on dev_r3


[Batch 0] Starting processing with 15 examples
[Batch 0] Finished processing
[Batch 1] Starting processing with 15 examples
[Batch 1] Finished processing
[Batch 2] Starting processing with 15 examples
[Batch 2] Finished processing
[Batch 4] Starting processing with 15 examples
[Batch 3] Starting processing with 15 examples
[Batch 3] Finished processing
[Batch 5] Starting processing with 15 examples
[Batch 6] Starting processing with 15 examples
[Batch 7] Starting processing with 15 examples
[Batch 8] Starting processing with 15 examples
[Batch 9] Starting processing with 15 examples
[Batch 9] Finished processing
[Batch 10] Starting processing with 15 examples
[Batch 6] Finished processing
[Batch 11] Starting processing with 15 examples
[Batch 5] Finished processing
[Batch 12] Starting processing with 15 examples
[Batch 7] Finished processing
[Batch 13] Starting processing with 15 examples
[Batch 8] Finished processing
[Batch 14] Starting processin

In [20]:
# Score both Task 1.4 predictors against the gold labels of 'dev_r3'.
# Predictions and golds are matched by position inside evaluate_predictions.
print("\n\nEvaluating JOINT model results on dev_r3\n\n")
evaluate_predictions(
    ordered_llm_predictions=dev_r3_llm_joint_results,
    gold_labels=dataset['dev_r3']['label'],
    label2id={"entailment": 0, "neutral": 1, "contradiction": 2}
)

print("\n\nEvaluating PIPELINE model results on dev_r3\n\n")
evaluate_predictions(
    ordered_llm_predictions=dev_r3_llm_pipeline_results,
    gold_labels=dataset['dev_r3']['label'],
    label2id={"entailment": 0, "neutral": 1, "contradiction": 2}
)




Evaluating JOINT model results on dev_r3


Evaluation results:
  Accuracy:  0.6458333333333334
  Precision: 0.6431048878911784
  Recall:    0.6458333333333334
  F1:        0.6441062850126622


Evaluating PIPELINE model results on dev_r3


Evaluation results:
  Accuracy:  0.625
  Precision: 0.6232593331744185
  Recall:    0.625
  F1:        0.6218908466082799



4) Define a similarity threshold, "acceptable" - see "sentence-transformers.ipynb", 
https://dspy.ai/tutorials/output_refinement/best-of-n-and-refine/
5) Define a similarity test. **Let's SUM the number of acceptable reasons, and print that.** 
6) Perform the 3 passages tests
7) print the results in a clean and meaningful manner
10) We need to justify the hybrid approach - using dspy as wrapper but manually prompting , and also verify if he will downgrade for that.


We picked the general-purpose model "all-MiniLM-L6-v2" over two other options:
1) all-mpnet-base-v2: has slightly better performance, but is much slower (x5)
2) STS models such as distiluse-base-multilingual-cased-v1 and distiluse-base-multilingual-cased-v2: trained with fewer parameters and weaker for English since they are multilingual

See https://sbert.net/docs/sentence_transformer/pretrained_models.html for reference

In [21]:
from sentence_transformers import SentenceTransformer, util
# Sentence encoder used for the similarity tests below.
# Note: this global `model` is the embedding model, not the LLM (`lm`).
model = SentenceTransformer('all-MiniLM-L6-v2')

In [34]:
from sklearn.model_selection import train_test_split
import numpy as np

# Load Hugging Face metrics once at module level.
# In the visible part of the notebook only f1_metric is used
# (by optimize_reasoning_threshold); the other three are loaded but not referenced.
accuracy_metric = load("accuracy")
precision_metric = load("precision")
recall_metric = load("recall")
f1_metric = load("f1")


def optimize_reasoning_threshold(examples, model, test_size=0.2, thresholds=None):
    """
    Pick the cosine-similarity threshold that best predicts the binary label.

    For each validation example, the text "premise hypothesis" and the reason
    are embedded and their cosine similarity is computed once. For every
    candidate threshold, the prediction ``similarity >= threshold`` is scored
    against the example's binary 'label' with F1; the first threshold reaching
    the highest F1 wins. Progress and the final summary are printed.

    Args:
        examples (list[dict]): Each dict must have keys: 'premise', 'hypothesis', 'reason', 'label'
        model: Preloaded SentenceTransformer (anything with a compatible ``encode``)
        test_size (float): Fraction of examples used as the validation split
        thresholds (iterable[float]): Thresholds to try (defaults to np.arange(0.3,0.91,0.05))

    Returns:
        (float, int): best threshold, number of validation items whose
        similarity is at or above it

    Raises:
        ValueError: if ``thresholds`` is given but empty.
    """
    if thresholds is None:
        thresholds = np.arange(0.3, 0.91, 0.05)
    # Materialize so that emptiness can be checked and generators are accepted.
    thresholds = list(thresholds)
    if not thresholds:
        raise ValueError("thresholds must contain at least one value")

    # Only the validation part of the split is used; the seed keeps it reproducible.
    _, val_data = train_test_split(examples, test_size=test_size, random_state=42)
    print(f"[DEBUG] Validation examples: {len(val_data)}")

    # Similarities do not depend on the threshold, so compute them exactly once
    # instead of once per candidate threshold.
    sims, labels = [], []
    for ex in val_data:
        ph_text = ex["premise"] + " " + ex["hypothesis"]
        ph_emb = model.encode(ph_text, convert_to_tensor=True)
        r_emb = model.encode(ex["reason"], convert_to_tensor=True)
        sims.append(util.cos_sim(ph_emb, r_emb).item())
        labels.append(ex["label"])

    # Manual threshold search; strict '>' keeps the first of several tied thresholds.
    best_f1 = -1
    best_threshold = None

    for thresh in thresholds:
        preds = [int(sim >= thresh) for sim in sims]
        f1 = f1_metric.compute(predictions=preds, references=labels)['f1']
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = thresh

        print(f"[DEBUG] Threshold {thresh:.2f} → F1={f1:.3f}")

    above_count = sum(s >= best_threshold for s in sims)
    print(f"\nOptimal Threshold: {best_threshold:.3f}")
    print(f"Number of items above threshold: {above_count} / {len(sims)}")
    print(f"Percentage above threshold: {above_count / len(sims) * 100:.2f}%")
    print(f"Best F1 Score: {best_f1:.3f}")

    return best_threshold, above_count


In [None]:
def unify_ph_with_reasons_ordered(test_r3, results):
    """
    Pair dataset rows with prediction tuples by position and flag agreement.

    Args:
        test_r3: Iterable of rows with keys 'uid', 'premise', 'hypothesis', 'label'
            (label is an int id: 0, 1 or 2)
        results: List of tuples (uid, label, reason_text); the label may be
            given as the class name string or as the int id

    Returns:
        List of dicts with keys 'uid', 'premise', 'hypothesis', 'reason' and
        'label', where 'label' is 1 when the result label matches the row's
        gold label and 0 otherwise. Pairing stops at the shorter input.

    Raises:
        AssertionError: if a row and its paired result carry different uids.
    """
    class_names = ("entailment", "neutral", "contradiction")
    # For every gold id, the set of result labels that count as a match.
    accepted_by_gold = {gold_id: {name, gold_id} for gold_id, name in enumerate(class_names)}

    merged = []
    for row, result in zip(test_r3, results):
        res_uid, res_label, reason_text = result
        assert row["uid"] == res_uid, f"UID mismatch: {row['uid']} != {res_uid}"

        is_match = res_label in accepted_by_gold[row["label"]]
        record = dict(
            uid=row["uid"],
            premise=row["premise"],
            hypothesis=row["hypothesis"],
            reason=reason_text,
            label=int(is_match),
        )
        merged.append(record)

    return merged


# # Example usage:
# test1_pipeline = unify_ph_with_reasons_ordered(dataset['dev_r3'], dev_r3_llm_pipeline_results)

# for i in range(30):
#     print(test1_pipeline[i])  # Print first 3 examples for verification

# TEST 1: Human reason vs. LLM reason

# TEST 2: (premise,hypothesis) vs. LLM reason

In [42]:
# TEST 2: similarity between the (premise, hypothesis) text and the LLM-written reason.
# The binary label of each item says whether the LLM's predicted class matched the gold class.
test2_joint = unify_ph_with_reasons_ordered(dataset['dev_r3'], dev_r3_llm_joint_results)
print("***Comparing the joint prompt model with the (premise, hypothesis) pair****")
optimize_reasoning_threshold(test2_joint, model)
print("\n\n")
test2_pipeline = unify_ph_with_reasons_ordered(dataset['dev_r3'], dev_r3_llm_pipeline_results)
print("***Comparing the PIPELINE prompt model with the (premise, hypothesis) pair****")
# Last expression of the cell: its (threshold, count) return value is displayed as the cell output.
optimize_reasoning_threshold(test2_pipeline, model)


***Comparing the joint prompt model with the (premise, hypothesis) pair****
[DEBUG] Validation examples: 240


  return forward_call(*args, **kwargs)


[DEBUG] Threshold 0.30 → F1=0.762
[DEBUG] Threshold 0.35 → F1=0.756
[DEBUG] Threshold 0.40 → F1=0.751
[DEBUG] Threshold 0.45 → F1=0.737
[DEBUG] Threshold 0.50 → F1=0.693
[DEBUG] Threshold 0.55 → F1=0.620
[DEBUG] Threshold 0.60 → F1=0.537
[DEBUG] Threshold 0.65 → F1=0.432
[DEBUG] Threshold 0.70 → F1=0.336
[DEBUG] Threshold 0.75 → F1=0.202
[DEBUG] Threshold 0.80 → F1=0.084
[DEBUG] Threshold 0.85 → F1=0.000
[DEBUG] Threshold 0.90 → F1=0.000

Optimal Threshold: 0.300
Number of items above threshold: 214 / 240
Percentage above threshold: 89.17%
Best F1 Score: 0.762



***Comparing the PIPELINE prompt model with the (premise, hypothesis) pair****
[DEBUG] Validation examples: 240


  return forward_call(*args, **kwargs)


[DEBUG] Threshold 0.30 → F1=0.730
[DEBUG] Threshold 0.35 → F1=0.707
[DEBUG] Threshold 0.40 → F1=0.679
[DEBUG] Threshold 0.45 → F1=0.653
[DEBUG] Threshold 0.50 → F1=0.612
[DEBUG] Threshold 0.55 → F1=0.544
[DEBUG] Threshold 0.60 → F1=0.470
[DEBUG] Threshold 0.65 → F1=0.317
[DEBUG] Threshold 0.70 → F1=0.171
[DEBUG] Threshold 0.75 → F1=0.066
[DEBUG] Threshold 0.80 → F1=0.014
[DEBUG] Threshold 0.85 → F1=0.000
[DEBUG] Threshold 0.90 → F1=0.000

Optimal Threshold: 0.300
Number of items above threshold: 214 / 240
Percentage above threshold: 89.17%
Best F1 Score: 0.730


(np.float64(0.3), np.int64(214))

# Test 3: (premise,hypothesis) vs. HUMAN reason

In [52]:
# TEST 3: similarity between the (premise, hypothesis) text and the HUMAN reason.
# The dataset rows are reshaped into the (uid, label, reason) layout of the LLM results.
# NOTE(review): the "predicted" label here is the gold int label itself, so
# unify_ph_with_reasons_ordered marks every item with binary label 1; the F1
# below therefore only reflects how many similarities exceed each threshold.
given_results = [(row['uid'], row['label'], row['reason']) for row in dataset['dev_r3']]
test3 = unify_ph_with_reasons_ordered(dataset['dev_r3'], given_results)
print("***Comparing the given HUMAN reasons with the (premise, hypothesis) pair****")
optimize_reasoning_threshold(test3, model)

***Comparing the given HUMAN reasons with the (premise, hypothesis) pair****
[DEBUG] Validation examples: 240


  return forward_call(*args, **kwargs)


[DEBUG] Threshold 0.30 → F1=0.832
[DEBUG] Threshold 0.35 → F1=0.763
[DEBUG] Threshold 0.40 → F1=0.689
[DEBUG] Threshold 0.45 → F1=0.617
[DEBUG] Threshold 0.50 → F1=0.476
[DEBUG] Threshold 0.55 → F1=0.416
[DEBUG] Threshold 0.60 → F1=0.328
[DEBUG] Threshold 0.65 → F1=0.235
[DEBUG] Threshold 0.70 → F1=0.147
[DEBUG] Threshold 0.75 → F1=0.080
[DEBUG] Threshold 0.80 → F1=0.033
[DEBUG] Threshold 0.85 → F1=0.008
[DEBUG] Threshold 0.90 → F1=0.000

Optimal Threshold: 0.300
Number of items above threshold: 171 / 240
Percentage above threshold: 71.25%
Best F1 Score: 0.832


(np.float64(0.3), np.int64(171))