In [6]:
import os 
from openai import OpenAI
from dotenv import load_dotenv
from pydantic import BaseModel

# Response schema handed to `client.responses.parse(..., text_format=...)` in
# this cell: the model's reply is parsed into an object with one float field.
class DistanceOutput(BaseModel):
    # Score requested by the system prompt (0.0 = identical, near 1 = totally
    # dissimilar). The 0.0-1.0 range is only stated in the prompt; this plain
    # `float` field does not enforce it.
    distance: float
    
# Load environment settings first so OPENAI_API_KEY can be read via os.getenv,
# then build the client used for the one-off request below.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Instructions for the model: return one JSON object holding a 0-1 distance.
system_prompt = """You are given two short phrases. Output strictly a JSON object with:
  - "distance": a float ≥0.0 and ≤1.0
    • 0.0 = identical
    • values near 0 = very similar
    • values near 1 = totally dissimilar

Return it **as JSON and nothing else**."""

# The pair of phrases scored by this cell.
guess = "Baseball"
concept = "Sports"
prompt = f"Phrase 1: {guess}, Phrase 2: {concept}"

# System instructions followed by the user message carrying both phrases.
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": prompt},
]

# `text_format` asks the SDK to parse the reply into a DistanceOutput.
response = client.responses.parse(
    model="gpt-4.1-nano-2025-04-14",
    input=messages,
    text_format=DistanceOutput,
)

print(response.output_parsed)

distance=0.2


In [15]:
# Private response schema used by LLMDistance.distance as `text_format`:
# the model's reply is parsed into an object with one float field.
class _DistanceSchema(BaseModel):
    # Score described by LLMDistance._system_prompt. The 0.0-1.0 range is only
    # stated in that prompt; this plain `float` field does not enforce it.
    distance: float

class LLMDistance:
    """LLM-based semantic distance with lazy OpenAI client init.

    All state lives on the class: the OpenAI client is created on first use by
    `_get_client` and reused afterwards, so callers only ever need
    `LLMDistance.distance(a, b)`.
    """

    # Shared OpenAI client; stays None until the first `_get_client()` call.
    _client = None
    # Model identifier sent with every request.
    _model = "gpt-4.1-mini-2025-04-14"
    # Scoring rubric plus three worked examples. Each example's explanation
    # block opens and closes with the same tag name, and the sample responses
    # are valid JSON so they agree with the "strictly a JSON object" rule.
    # NOTE(review): the rubric leaves scores between .4 and .7 undescribed —
    # confirm whether that gap is intentional.
    _system_prompt = """
        # Instructions
        You are given two short phrases. Output strictly a JSON object with:
          - "distance": a float ≥0.0 and ≤1.0
            • values < .2 = similar enough you wouldn't discount someone from saying one vs. other in a quiz
            • .2 < values < .4 = pretty close but not really the same, for example if one is a subgroup of the other
            • values > .7 = not really close at all 
        Return it **as JSON and nothing else**

        # Examples:
        <distance_inputs id="example-1">
            Phrase 1: Sports, Phrase 2: Athletics 
        </distance_inputs>

        <assistant_response id="example-1">
            {"distance": 0.1}
        </assistant_response>

        <assistant_explanation id="example-1">
            Sports and Athletics are basically identical answers (so < .2) but not the exact same (so > 0)
        </assistant_explanation>

        <distance_inputs id="example-2">
            Phrase 1: Sports, Phrase 2: Baseball 
        </distance_inputs>

        <assistant_response id="example-2">
            {"distance": 0.3}
        </assistant_response>

        <assistant_explanation id="example-2">
            Sports and Baseball are definitely related (so < .4) but one is a subgroup of the other (so > .2)
        </assistant_explanation>

        <distance_inputs id="example-3">
            Phrase 1: Sports, Phrase 2: Eating 
        </distance_inputs>

        <assistant_response id="example-3">
            {"distance": 1.0}
        </assistant_response>

        <assistant_explanation id="example-3">
            Sports and Eating are completely unrelated
        </assistant_explanation>
    """

    @classmethod
    def _get_client(cls) -> OpenAI:
        """Return the shared OpenAI client, creating it on first use.

        On the first call this runs `load_dotenv()` and builds the client with
        the `OPENAI_API_KEY` environment variable; later calls return the
        cached instance stored on the class.
        """
        if cls._client is None:
            load_dotenv()
            cls._client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        return cls._client

    @classmethod
    def distance(cls, a: str, b: str) -> float:
        """
        Compute semantic distance ∈ [0,1] between phrases `a` and `b`
        (0 = identical, 1 = unrelated).  Uses cached OpenAI client.

        Args:
            a: First phrase (sent to the model as "Phrase 1").
            b: Second phrase (sent to the model as "Phrase 2").

        Returns:
            The model's distance score, clamped into [0.0, 1.0].

        Raises:
            RuntimeError: If the response contains no parsed output (for
                example when the model declines to answer).
        """
        prompt = f"Phrase 1: {a}, Phrase 2: {b}"
        response = cls._get_client().responses.parse(
            model=cls._model,
            input=[
                {"role": "system", "content": cls._system_prompt},
                {"role": "user", "content": prompt},
            ],
            text_format=_DistanceSchema,
            temperature=0,
        )

        parsed = response.output_parsed
        if parsed is None:
            # Without this guard the caller would see an opaque
            # "'NoneType' object has no attribute 'distance'".
            raise RuntimeError(
                f"Model returned no parsable distance for {a!r} vs {b!r}"
            )

        # The schema is a bare float, so enforce the documented range here.
        return min(1.0, max(0.0, parsed.distance))


# Score a single pair; as the cell's last expression, the returned float is
# displayed as the cell output.
LLMDistance.distance("Baseball", "Sports")


0.3

In [16]:
# Example pairs, matched by position: concepts[i] is scored against guesses[i].
# NOTE(review): 'Cusine' looks like a misspelling of 'Cuisine' — kept as-is so
# the recorded output still corresponds to this input; confirm intent.
concepts = ['Sports', 'Movies', 'Food', 'Food']
guesses = ['Chess', 'Media', 'Cusine', 'Fishing']

# One request per pair; each line prints the pair and its distance.
for concept, guess in zip(concepts, guesses):
    score = LLMDistance.distance(concept, guess)
    print(f"{concept=}, {guess=} distance={score}")

concept='Sports', guess='Chess' distance=0.8
concept='Movies', guess='Media' distance=0.3
concept='Food', guess='Cusine' distance=0.15
concept='Food', guess='Fishing' distance=0.8
