Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions packages/sdk/server-ai/src/ldai/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,9 +215,27 @@ def _judge_config(
default: AIJudgeConfigDefault,
variables: Optional[Dict[str, Any]] = None,
) -> AIJudgeConfig:
if variables is not None:
if variables.get('message_history') is not None:
log.warning(
"The variable 'message_history' is reserved by the judge and will be ignored."
)
if variables.get('response_to_evaluate') is not None:
log.warning(
"The variable 'response_to_evaluate' is reserved by the judge and will be ignored."
)

# Re-inject the reserved variables as their literal placeholders so they
# survive Mustache interpolation in ``__evaluate``. Without this, legacy
# templates like ``{{message_history}}`` get rendered to empty strings and
# ``_strip_legacy_judge_messages`` below cannot detect them.
extended_variables = dict(variables) if variables else {}
extended_variables['message_history'] = '{{message_history}}'
extended_variables['response_to_evaluate'] = '{{response_to_evaluate}}'

(model, provider, messages, instructions,
tracker_factory, enabled, judge_configuration, variation) = self.__evaluate(
key, context, default.to_dict(), variables
key, context, default.to_dict(), extended_variables
)

def _extract_evaluation_metric_key(variation: Dict[str, Any]) -> Optional[str]:
Expand Down Expand Up @@ -336,12 +354,8 @@ def _create_judge_instance(
when materializing judges referenced by an AI config's judge configuration.
"""
try:
extended_variables = dict(variables) if variables else {}
extended_variables['message_history'] = '{{message_history}}'
extended_variables['response_to_evaluate'] = '{{response_to_evaluate}}'

judge_config = self._judge_config(
key, context, default or _DISABLED_JUDGE_DEFAULT, extended_variables
key, context, default or _DISABLED_JUDGE_DEFAULT, variables
)

if not judge_config.enabled:
Expand Down
134 changes: 133 additions & 1 deletion packages/sdk/server-ai/tests/test_judge.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""Tests for Judge functionality."""

from unittest.mock import AsyncMock, MagicMock, call
from unittest.mock import AsyncMock, MagicMock, call, patch

import pytest
from ldclient import Config, Context, LDClient
from ldclient.integrations.test_data import TestData

from ldai import LDAIClient
from ldai.judge import Judge, _strip_legacy_judge_messages
from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
from ldai.models import (
Expand Down Expand Up @@ -543,6 +544,137 @@ async def test_evaluate_messages_calls_evaluate(
assert tracker.track_metrics_of_async.called


class TestJudgeConfigStripsLegacyMessages:
    """Tests for ``LDAIClient.judge_config()`` legacy-message stripping.

    Both ``LDAIClient.judge_config()`` (the direct public API) and
    ``LDAIClient.create_judge()`` (the wrapper API) reach the same internal
    ``_judge_config()`` method. ``_judge_config()`` is responsible for
    injecting ``message_history``/``response_to_evaluate`` markers so they
    survive Mustache rendering, then stripping any legacy template messages
    before returning the config.
    """

    @pytest.fixture
    def context(self) -> Context:
        return Context.create('user-key')

    def _make_client(self, data_source: TestData) -> LDAIClient:
        # Back the SDK client with the in-memory test data source so the
        # tests run without network traffic or event delivery.
        sdk_config = Config('sdk-key', update_processor_class=data_source, send_events=False)
        return LDAIClient(LDClient(config=sdk_config))

    def _seed_judge_flag(self, flag_key, messages) -> TestData:
        """Return a TestData source serving *flag_key* as an enabled judge flag."""
        data = TestData.data_source()
        data.update(
            data.flag(flag_key)
            .variations({
                'model': {'name': 'gpt-4'},
                'provider': {'name': 'openai'},
                'messages': messages,
                'evaluationMetricKey': '$ld:ai:judge:relevance',
                '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1},
            })
            .variation_for_all(0)
        )
        return data

    def _captured_warnings(self, log_spy):
        """Collect the first positional argument of every ``log.warning`` call."""
        return [warning_call.args[0] for warning_call in log_spy.warning.call_args_list]

    def test_judge_config_strips_legacy_messages_from_returned_config(self, context):
        """Calling ``judge_config()`` directly (no variables) still strips legacy messages.

        This is the regression for the bug where legacy messages leaked through
        the public ``judge_config()`` entry point because reserved-variable
        markers were only injected in ``_create_judge_instance``.
        """
        data = self._seed_judge_flag('legacy-judge', [
            {'role': 'system', 'content': 'You are a judge.'},
            {'role': 'assistant', 'content': '{{message_history}}'},
            {'role': 'user', 'content': 'Evaluate: {{response_to_evaluate}}'},
        ])
        client = self._make_client(data)

        config = client.judge_config('legacy-judge', context)

        assert config.enabled is True
        assert config.messages is not None
        assert len(config.messages) == 1
        remaining = config.messages[0]
        assert remaining.role == 'system'
        assert remaining.content == 'You are a judge.'

    def test_judge_config_passes_user_variables_to_template(self, context):
        """User variables are still interpolated into the system message."""
        data = self._seed_judge_flag('parametric-judge', [
            {'role': 'system', 'content': 'You are a {{tone}} judge.'},
        ])
        client = self._make_client(data)

        config = client.judge_config(
            'parametric-judge', context, variables={'tone': 'strict'}
        )

        assert config.messages is not None
        assert config.messages[0].content == 'You are a strict judge.'

    def test_judge_config_warns_on_reserved_variables(self, context):
        """``_judge_config`` warns when callers pass reserved variable names."""
        data = self._seed_judge_flag('judge-config', [
            {'role': 'system', 'content': 'You are a judge.'},
        ])
        client = self._make_client(data)

        with patch('ldai.client.log') as log_spy:
            client.judge_config(
                'judge-config',
                context,
                variables={
                    'message_history': 'should be ignored',
                    'response_to_evaluate': 'should be ignored',
                },
            )

        emitted = self._captured_warnings(log_spy)
        assert any("'message_history' is reserved" in message for message in emitted)
        assert any("'response_to_evaluate' is reserved" in message for message in emitted)

    def test_judge_config_does_not_warn_without_reserved_variables(self, context):
        """No warnings should be emitted when callers pass non-reserved variables."""
        data = self._seed_judge_flag('judge-config', [
            {'role': 'system', 'content': 'You are a judge.'},
        ])
        client = self._make_client(data)

        with patch('ldai.client.log') as log_spy:
            client.judge_config('judge-config', context, variables={'tone': 'strict'})

        emitted = self._captured_warnings(log_spy)
        assert not any("'message_history' is reserved" in message for message in emitted)
        assert not any("'response_to_evaluate' is reserved" in message for message in emitted)


class TestEvaluationSchemaBuilder:
"""Tests for EvaluationSchemaBuilder."""

Expand Down
Loading