In [None]:
from typing import Callable, Iterable
from dataclasses import dataclass
from scipy.stats import pearsonr

from models.nli.nli_base import NLIZeroshotClassifier


@dataclass
class NLIParams:
    """One zero-shot NLI configuration to be evaluated by ``test_params``."""

    # Labels handed to the model as ``candidate_labels``.
    candidate_labels: list[str]
    # Template handed to the model as ``hypothesis_template``.
    hypothesis_template: str
    # Reduces the scores produced for a single text to one number.
    value_getter: Callable[[Iterable[float]], float]

    def __str__(self) -> str:
        """Render labels and template only; the score reducer is left out."""
        labels_part = f"self.candidate_labels={self.candidate_labels!r}"
        template_part = f"self.hypothesis_template={self.hypothesis_template!r}"
        return " | ".join((labels_part, template_part))


def test_params(
    model: NLIZeroshotClassifier, params: NLIParams, texts: list[str], labels: list[str]
) -> None:
    with model.set_options(
        candidate_labels=params.candidate_labels,
        hypothesis_template=params.hypothesis_template,
    ) as m:
        scores = [params.value_getter(score) for score in m.evaluate_segments(texts)]

    # Calculate correlation between scores and true labels
    correlation, _ = pearsonr(scores, labels)
    return correlation

  from .autonotebook import tqdm as notebook_tqdm
2025-10-19 20:13:46.122515: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760897626.162219  639386 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760897626.186683  639386 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-19 20:13:46.269861: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
from models.nli.nli_roberta import NLIRoberta


# Build the NLI zero-shot classifier that the comparison cell passes to test_params.
# No arguments are given, so the checkpoint and device come from NLIRoberta's own
# defaults (the recorded run reported "Device set to use cpu").
model = NLIRoberta()

Device set to use cpu


In [3]:
import pandas as pd
import sys
from pathlib import Path

# Make the parent of the current working directory importable so that `utils`
# resolves (presumably the project root -- verify if the notebook is moved).
# Guarded so that re-running this cell does not keep appending duplicates.
_parent_dir = str(Path.cwd().parent)
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)
from utils import DATA_DIR

# Training split of the "small" dataset; the comparison cell reads its
# "text" and "label" columns.
train_set = pd.read_parquet(DATA_DIR / "datasets" / "small" / "train.parquet")

In [4]:
# Prompt variants under comparison. Both keep the score at index 0 of each
# per-text result as the value to correlate.
params_for_comparison = [
    NLIParams(
        hypothesis_template="This text is {} in terms of visual details of characters, setting, or environment.",
        candidate_labels=["detailed", "not detailed"],
        value_getter=lambda scores: scores[0],
    ),
    NLIParams(
        hypothesis_template="This text {}",
        candidate_labels=[
            "can be easily visualized with specific sensory details",
            "is difficult to visualize or abstract",
        ],
        value_getter=lambda scores: scores[0],
    ),
]

# One Pearson coefficient per parameter set, in the same order as above.
correlation = []
for params in params_for_comparison:
    corr = test_params(
        model,
        params,
        texts=train_set["text"].tolist(),
        labels=train_set["label"].tolist(),
    )
    # Report each result as soon as it is available.
    print(f"{params}: {corr}")
    correlation.append(corr)

self.candidate_labels=['detailed', 'not detailed'] | self.hypothesis_template='This text is {} in terms of visual details of characters, setting, or environment.': 0.6750189928326207
self.candidate_labels=['can be easily visualized with specific sensory details', 'is difficult to visualize or abstract'] | self.hypothesis_template='This text {}': 0.19942125667573896


### Best

microsoft/deberta-base-mnli

MoritzLaurer/deberta-v3-base-zeroshot-v2.0

candidate_labels=['visual', 'not visual'], hypothesis_template='This text is {} in terms of sensory details, imagery, characters, environment, and vivid descriptions.': 0.3074255046401304
candidate_labels=['visual', 'non_visual'], hypothesis_template='This text is {} in terms of sensory details, imagery, characters, and vivid descriptions of foreground and background.': 0.31445027174020534
candidate_labels=['visual', 'non_visual'], hypothesis_template='This text is {} in terms of sensory details, imagery, characters, environment, and vivid descriptions of foreground and background.': 0.32622171646256426
candidate_labels=['descriptive', 'non_descriptive'], hypothesis_template='This text is {} in terms of visual details of characters, setting, or environment.': 0.2955842441890373
candidate_labels=['disagree', 'agree', 'strongly agree'], hypothesis_template='I {} that the text is visual in terms of sensory details, imagery, and vivid descriptions.': 0.11353990327893874
candidate_labels=['visual', 'non_visual'], hypothesis_template='This text is {} in terms of visual details of characters, setting, or environment.': 0.20003419028989294
candidate_labels=['detailed', 'not_detailed'], hypothesis_template='This text is {} in terms of visual details of characters, setting, or environment.': 0.38513134801015203
candidate_labels=['descriptive', 'non_descriptive'], hypothesis_template='This text is {} in terms of sensory details, imagery, and vivid descriptions.': 0.1388104796342361
candidate_labels=['visual', 'non_visual'], hypothesis_template='This text is {} in terms of sensory details, imagery, and vivid descriptions.': 0.3462052985649622
candidate_labels=['detailed', 'not_detailed'], hypothesis_template='This text is {} in terms of sensory details, imagery, and vivid descriptions.': 0.2931661821955478

self.candidate_labels=['This text contains no visual imagery', 'This text describes a simple object, body part, or animal', 'This text describes a simple object or person in action', 'This text describes a detailed object, place, or identifiable character', 'This text describes a detailed place or character performing an action', 'This text describes a complete scene or detailed face', 'This text describes a complete scene with action or movement', 'This text describes a rich scene with multiple visual elements including character, setting, and details', 'This text describes a rich scene with multiple visual elements and dynamic action', 'This text vividly depicts a full scene with background, foreground, atmospheric details, and time of day'] | self.hypothesis_template='This passage depicts {}': 0.14101946631175694
self.candidate_labels=['no visual description', 'a basic visible object or person', 'a basic object or person doing something', 'a detailed location, object, or recognizable character', 'a detailed place or character in motion', 'a scene with face details or minimal setting', 'a scene with setting and action', 'a rich visual scene with character, setting, and descriptive details', 'a rich visual scene with multiple elements in motion', 'a vivid, complete scene with atmosphere, lighting, and layered visual information'] | self.hypothesis_template='The visual richness of this text shows {}': 0.010100976665978988
self.candidate_labels=['zero visual imagery', 'minimal visual content - single simple element', 'low visual detail - simple element with action', 'moderate visual content - detailed element or basic character', 'moderate visual scene - setting with action', 'good visual description - complete scene or detailed character', 'rich visual scene - setting with movement and details', 'very rich visual scene - multiple described elements and context', 'vivid dynamic scene - rich setting with action and details', 'fully immersive visual scene - atmospheric, layered, cinematic description'] | self.hypothesis_template='The level of visual description in this text is: {}': 0.15886393925343895

self.candidate_labels=['contains vivid visual description with multiple specific details', 'contains minimal or no visual description'] | self.hypothesis_template='This text {}': 0.2468695717544066
self.candidate_labels=['describes a visually complete scene with setting and details', 'describes abstract concepts without visual imagery'] | self.hypothesis_template='This text {}': 0.2814456504320367
self.candidate_labels=['can be easily visualized with specific sensory details', 'is difficult to visualize or abstract'] | self.hypothesis_template='This text {}': 0.34294634460914136
self.candidate_labels=['provides rich visual description of people, places, or objects', 'provides little visual description'] | self.hypothesis_template='This text {}': 0.10715816625655927

deberta - 5:45
candidate_labels=['detailed', 'not_detailed'], hypothesis_template='This text is {} in terms of visual details of characters, setting, or environment.': 0.38513134801015203
self.candidate_labels=['can be easily visualized with specific sensory details', 'is difficult to visualize or abstract'] | self.hypothesis_template='This text {}': 0.34294634460914136

bart-large - 11:06
candidate_labels=['detailed', 'not_detailed'], hypothesis_template='This text is {} in terms of visual details of characters, setting, or environment.': 0.350
self.candidate_labels=['can be easily visualized with specific sensory details', 'is difficult to visualize or abstract'] | self.hypothesis_template='This text {}': 0.276

MoritzLaurer/ModernBERT-large-zeroshot-v2.0 - 18:38
candidate_labels=['detailed', 'not_detailed'], hypothesis_template='This text is {} in terms of visual details of characters, setting, or environment.': 0.486
self.candidate_labels=['can be easily visualized with specific sensory details', 'is difficult to visualize or abstract'] | self.hypothesis_template='This text {}': 0.421

MoritzLaurer/deberta-v3-large-zeroshot-v2.0 - 21:09
self.candidate_labels=['detailed', 'not_detailed'] | self.hypothesis_template='This text is {} in terms of visual details of characters, setting, or environment.': 0.510
self.candidate_labels=['can be easily visualized with specific sensory details', 'is difficult to visualize or abstract'] | self.hypothesis_template='This text {}': 0.387

MoritzLaurer/deberta-v3-base-zeroshot-v2.0 - 8:44
self.candidate_labels=['detailed', 'not_detailed'] | self.hypothesis_template='This text is {} in terms of visual details of characters, setting, or environment.': 0.508 -> 0.496
self.candidate_labels=['can be easily visualized with specific sensory details', 'is difficult to visualize or abstract'] | self.hypothesis_template='This text {}': 0.245

MoritzLaurer/roberta-large-zeroshot-v2.0-c - 16:48
self.candidate_labels=['detailed', 'not_detailed'] | self.hypothesis_template='This text is {} in terms of visual details of characters, setting, or environment.': 0.572 -> 0.578
self.candidate_labels=['can be easily visualized with specific sensory details', 'is difficult to visualize or abstract'] | self.hypothesis_template='This text {}': 0.239

MoritzLaurer/roberta-base-zeroshot-v2.0-c - 21:43
self.candidate_labels=['detailed', 'not detailed'] | self.hypothesis_template='This text is {} in terms of visual details of characters, setting, or environment.': 0.6750189928326207
self.candidate_labels=['can be easily visualized with specific sensory details', 'is difficult to visualize or abstract'] | self.hypothesis_template='This text {}': 0.19942125667573896