From 775aaca7866133737130150e1d0f3cd63c942d2e Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Tue, 5 May 2026 14:40:02 -0500 Subject: [PATCH 1/4] fix: build judge input as string; strip legacy message_history/response_to_evaluate messages Co-Authored-By: Claude Sonnet 4.6 --- .../ldai_langchain/langchain_agent_runner.py | 4 +- .../ldai_langchain/langchain_model_runner.py | 18 +- .../src/ldai_openai/openai_agent_runner.py | 4 +- .../src/ldai_openai/openai_model_runner.py | 20 +-- .../sdk/server-ai/src/ldai/judge/__init__.py | 76 ++++---- .../server-ai/src/ldai/providers/runner.py | 6 +- packages/sdk/server-ai/tests/test_judge.py | 169 ++++++++++++++++-- 7 files changed, 210 insertions(+), 87 deletions(-) diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_agent_runner.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_agent_runner.py index af9e9787..02d6b0e3 100644 --- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_agent_runner.py +++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_agent_runner.py @@ -33,7 +33,7 @@ def __init__(self, agent: Any): async def run( self, - input: Any, + input: str, output_type: Optional[Dict[str, Any]] = None, ) -> RunnerResult: """ @@ -42,7 +42,7 @@ async def run( Delegates to the compiled LangChain agent, which handles the tool-calling loop internally. - :param input: The user prompt or input to the agent + :param input: The user prompt string to the agent :param output_type: Reserved for future structured output support; currently ignored. :return: :class:`RunnerResult` with ``content``, ``raw`` response, and diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py index 341fd175..18e542dc 100644 --- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py +++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py @@ -37,37 +37,25 @@ def get_llm(self) -> BaseChatModel: async def run( self, - input: Any, + input: str, output_type: Optional[Dict[str, Any]] = None, ) -> RunnerResult: """ Run the LangChain model with the given input. - :param input: A string prompt or a list of :class:`LDMessage` objects + :param input: A string prompt :param output_type: Optional JSON schema dict requesting structured output. When provided, ``parsed`` on the returned :class:`RunnerResult` is populated with the parsed JSON document. :return: :class:`RunnerResult` containing ``content``, ``metrics``, ``raw`` and (when ``output_type`` is set) ``parsed``. """ - messages = self._coerce_input(input) + messages = [LDMessage(role='user', content=input)] if output_type is not None: return await self._run_structured(messages, output_type) return await self._run_completion(messages) - # convert_messages_to_langchain only accepts List[LDMessage]; _coerce_input - # normalizes a bare string to [LDMessage(role='user', ...)] before that step. 
- @staticmethod - def _coerce_input(input: Any) -> List[LDMessage]: - if isinstance(input, str): - return [LDMessage(role='user', content=input)] - if isinstance(input, list): - return input - raise TypeError( - f"Unsupported input type for LangChainModelRunner.run: {type(input).__name__}" - ) - async def _run_completion(self, messages: List[LDMessage]) -> RunnerResult: try: langchain_messages = convert_messages_to_langchain(messages) diff --git a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_agent_runner.py b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_agent_runner.py index 7370c58d..a66713f5 100644 --- a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_agent_runner.py +++ b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_agent_runner.py @@ -47,7 +47,7 @@ def __init__( async def run( self, - input: Any, + input: str, output_type: Optional[Dict[str, Any]] = None, ) -> RunnerResult: """ @@ -56,7 +56,7 @@ async def run( Delegates to the OpenAI Agents SDK ``Runner.run``, which handles the tool-calling loop internally. - :param input: The user prompt or input to the agent + :param input: The user prompt string to the agent :param output_type: Reserved for future structured output support; currently ignored. :return: :class:`RunnerResult` with ``content``, ``raw`` response, and diff --git a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py index 5e8ad33e..f04f6c12 100644 --- a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py +++ b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py @@ -35,39 +35,25 @@ def __init__( async def run( self, - input: Any, + input: str, output_type: Optional[Dict[str, Any]] = None, ) -> RunnerResult: """ Run the OpenAI model with the given input. - :param input: A string prompt or a list of :class:`LDMessage` objects + :param input: A string prompt :param output_type: Optional JSON schema dict requesting structured output. When provided, ``parsed`` on the returned :class:`RunnerResult` is populated with the parsed JSON document. :return: :class:`RunnerResult` containing ``content``, ``metrics``, ``raw`` and (when ``output_type`` is set) ``parsed``. 
""" - try: - messages = self._coerce_input(input) - except TypeError as error: - log.warning(f'OpenAI model runner received unsupported input type: {error}') - return RunnerResult(content='', metrics=LDAIMetrics(success=False, usage=None)) + messages = [LDMessage(role='user', content=input)] if output_type is not None: return await self._run_structured(messages, output_type) return await self._run_completion(messages) - @staticmethod - def _coerce_input(input: Any) -> List[LDMessage]: - if isinstance(input, str): - return [LDMessage(role='user', content=input)] - if isinstance(input, list): - return input - raise TypeError( - f"Unsupported input type for OpenAIModelRunner.run: {type(input).__name__}" - ) - async def _run_completion(self, messages: List[LDMessage]) -> RunnerResult: try: response = await self._client.chat.completions.create( diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py index a5504ff0..3a83152d 100644 --- a/packages/sdk/server-ai/src/ldai/judge/__init__.py +++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py @@ -1,9 +1,7 @@ """Judge implementation for AI evaluation.""" import random -from typing import Any, Dict, Optional, Tuple - -import chevron +from typing import Any, Dict, List, Optional, Tuple from ldai import log from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder @@ -12,6 +10,30 @@ from ldai.providers.types import JudgeResult, RunnerResult +def _strip_legacy_judge_messages(messages: List[LDMessage]) -> List[LDMessage]: + """ + Remove legacy judge template messages from a message list. + + Strips any non-system message whose content contains ``{{message_history}}`` + or ``{{response_to_evaluate}}``. These were used by older judge configs to + indicate where the SDK should interpolate the evaluated conversation; new + configs omit them entirely and rely on the string input built by + :meth:`Judge._build_evaluation_input`. + + :param messages: The raw message list from the judge AI config. + :return: A new list with legacy template messages removed. + """ + result = [] + for msg in messages: + if msg.role != 'system' and ( + '{{message_history}}' in msg.content + or '{{response_to_evaluate}}' in msg.content + ): + continue + result.append(msg) + return result + + class Judge: """ Judge implementation that handles evaluation functionality and conversation management. 
@@ -65,11 +87,6 @@ async def evaluate( judge_result.error_message = 'Judge configuration is missing required evaluationMetricKey' return judge_result - if not self._ai_config.messages: - log.warning('Judge configuration must include messages') - judge_result.error_message = 'Judge configuration must include messages' - return judge_result - if random.random() > effective_rate: log.debug(f'Judge evaluation skipped due to sampling rate: {effective_rate}') return judge_result @@ -77,12 +94,12 @@ async def evaluate( judge_result.sampled = True tracker = self._ai_config.create_tracker() - messages = self._construct_evaluation_messages(input_text, output_text) + evaluation_input = self._build_evaluation_input(input_text, output_text) assert self._evaluation_response_structure is not None response = await tracker.track_metrics_of_async( lambda result: result.metrics, - lambda: self._model_runner.run(messages, output_type=self._evaluation_response_structure), + lambda: self._model_runner.run(evaluation_input, output_type=self._evaluation_response_structure), ) if response.parsed is None: @@ -142,38 +159,21 @@ def get_model_runner(self) -> Runner: """ return self._model_runner - def _construct_evaluation_messages(self, input_text: str, output_text: str) -> list[LDMessage]: - """ - Constructs evaluation messages by combining judge's config messages with input/output. - - :param input_text: The input text - :param output_text: The output text to evaluate - :return: List of messages for evaluation + def _build_evaluation_input(self, input_text: str, output_text: str) -> str: """ - if not self._ai_config.messages: - return [] + Build the string input for the judge runner. - messages: list[LDMessage] = [] - for msg in self._ai_config.messages: - # Interpolate message content with reserved variables - content = self._interpolate_message(msg.content, { - 'message_history': input_text, - 'response_to_evaluate': output_text, - }) - messages.append(LDMessage(role=msg.role, content=content)) + Legacy messages (assistant/user messages containing ``{{message_history}}`` + or ``{{response_to_evaluate}}``) are stripped from the config; the runner + was already created from the judge AI config (which carries the system + message), so only the plain-text evaluation payload is needed here. - return messages - - def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str: - """ - Interpolates message content with variables using Mustache templating. 
- - :param content: The message content template - :param variables: Variables to interpolate - :return: Interpolated message content + :param input_text: The input text (message history) + :param output_text: The output text to evaluate + :return: Formatted evaluation input string """ - # Use chevron (Mustache) for templating, with no escaping - return chevron.render(content, variables) + _strip_legacy_judge_messages(self._ai_config.messages or []) + return f"MESSAGE HISTORY:\n{input_text}\n\nRESPONSE TO EVALUATE:\n{output_text}" def _parse_evaluation_response(self, data: Dict[str, Any]) -> Optional[Tuple[float, str]]: """ diff --git a/packages/sdk/server-ai/src/ldai/providers/runner.py b/packages/sdk/server-ai/src/ldai/providers/runner.py index 5e1b9abc..36f47915 100644 --- a/packages/sdk/server-ai/src/ldai/providers/runner.py +++ b/packages/sdk/server-ai/src/ldai/providers/runner.py @@ -16,13 +16,13 @@ class Runner(Protocol): async def run( self, - input: Any, + input: str, output_type: Optional[Dict[str, Any]] = None, ) -> RunnerResult: """ - Execute the runner with the given input. + Execute the runner with the given input string. - :param input: The input to the runner. + :param input: The string input to the runner. :param output_type: Optional JSON schema for structured output. :return: RunnerResult containing content, metrics, raw, and parsed fields. """ diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py index 9833327d..d6e50c26 100644 --- a/packages/sdk/server-ai/tests/test_judge.py +++ b/packages/sdk/server-ai/tests/test_judge.py @@ -1,12 +1,12 @@ """Tests for Judge functionality.""" -from unittest.mock import AsyncMock, MagicMock +from unittest.mock import AsyncMock, MagicMock, call import pytest from ldclient import Config, Context, LDClient from ldclient.integrations.test_data import TestData -from ldai.judge import Judge +from ldai.judge import Judge, _strip_legacy_judge_messages from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder from ldai.models import ( AIJudgeConfig, @@ -107,10 +107,73 @@ def judge_config_without_key(tracker) -> AIJudgeConfig: @pytest.fixture def judge_config_without_messages(tracker) -> AIJudgeConfig: - """Create a judge config without messages.""" + """Create a judge config without messages (None).""" return _make_judge_config(messages=None, tracker=tracker) +class TestStripLegacyJudgeMessages: + """Tests for the _strip_legacy_judge_messages helper.""" + + def test_strips_assistant_message_with_message_history(self): + """Non-system messages containing {{message_history}} should be removed.""" + messages = [ + LDMessage(role='system', content='You are a judge.'), + LDMessage(role='assistant', content='Here is the history: {{message_history}}'), + ] + result = _strip_legacy_judge_messages(messages) + assert len(result) == 1 + assert result[0].role == 'system' + + def test_strips_user_message_with_response_to_evaluate(self): + """Non-system messages containing {{response_to_evaluate}} should be removed.""" + messages = [ + LDMessage(role='system', content='You are a judge.'), + LDMessage(role='user', content='Evaluate: {{response_to_evaluate}}'), + ] + result = _strip_legacy_judge_messages(messages) + assert len(result) == 1 + assert result[0].role == 'system' + + def test_strips_all_legacy_messages(self): + """All non-system template messages should be stripped from a typical legacy config.""" + messages = [ + LDMessage(role='system', content='You are a judge.'), + 
LDMessage(role='assistant', content='{{message_history}}'), + LDMessage(role='user', content='{{response_to_evaluate}}'), + ] + result = _strip_legacy_judge_messages(messages) + assert len(result) == 1 + assert result[0].role == 'system' + + def test_does_not_strip_system_message_containing_template_vars(self): + """System messages are never stripped, even if they contain template variable names.""" + messages = [ + LDMessage(role='system', content='Judge using {{message_history}} and {{response_to_evaluate}}.'), + ] + result = _strip_legacy_judge_messages(messages) + assert len(result) == 1 + assert result[0].role == 'system' + + def test_does_not_strip_non_template_messages(self): + """Non-system messages without template variables are left untouched.""" + messages = [ + LDMessage(role='system', content='You are a judge.'), + LDMessage(role='user', content='This is a regular message.'), + ] + result = _strip_legacy_judge_messages(messages) + assert len(result) == 2 + + def test_returns_empty_list_for_empty_input(self): + """An empty input list should return an empty list.""" + assert _strip_legacy_judge_messages([]) == [] + + def test_new_style_config_system_only_unchanged(self): + """A new-style config with only a system message passes through unchanged.""" + messages = [LDMessage(role='system', content='You are a judge.')] + result = _strip_legacy_judge_messages(messages) + assert result == messages + + class TestJudgeInitialization: """Tests for Judge initialization.""" @@ -160,18 +223,104 @@ async def test_evaluate_returns_failure_when_evaluation_metric_key_missing( mock_runner.run.assert_not_called() @pytest.mark.asyncio - async def test_evaluate_returns_failure_when_messages_missing( - self, judge_config_without_messages: AIJudgeConfig, mock_runner + async def test_evaluate_succeeds_when_messages_is_none( + self, judge_config_without_messages: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): - """Evaluate should return a failed JudgeResult when messages are missing.""" - judge = Judge(judge_config_without_messages, mock_runner) + """Evaluate should proceed (not error early) when messages is None.""" + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={'score': 0.7, 'reasoning': 'Acceptable response.'}, + ) + mock_runner.run.return_value = mock_response + tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) + + config = _make_judge_config(messages=None, tracker=tracker) + judge = Judge(config, mock_runner) result = await judge.evaluate("input text", "output text") assert isinstance(result, JudgeResult) - assert result.success is False - assert result.sampled is False - mock_runner.run.assert_not_called() + assert result.sampled is True + + @pytest.mark.asyncio + async def test_evaluate_passes_string_input_to_runner( + self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner + ): + """runner.run() should receive the formatted string, NOT a message list.""" + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={'score': 0.85, 'reasoning': 'Good answer.'}, + ) + mock_runner.run.return_value = mock_response + tracker.track_metrics_of_async = AsyncMock( + side_effect=lambda _metric_fn, fn: fn() + ) + + judge = Judge(judge_config_with_key, mock_runner) + await judge.evaluate("What is AI?", "AI is artificial intelligence.") + + mock_runner.run.assert_called_once() + call_args = mock_runner.run.call_args + input_arg = call_args[0][0] if call_args[0] else 
call_args[1].get('input') + assert isinstance(input_arg, str) + assert "MESSAGE HISTORY:\nWhat is AI?" in input_arg + assert "RESPONSE TO EVALUATE:\nAI is artificial intelligence." in input_arg + + @pytest.mark.asyncio + async def test_evaluate_string_input_format( + self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner + ): + """runner.run() should receive the exact expected string format.""" + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={'score': 0.9, 'reasoning': 'Correct.'}, + ) + mock_runner.run.return_value = mock_response + tracker.track_metrics_of_async = AsyncMock( + side_effect=lambda _metric_fn, fn: fn() + ) + + judge = Judge(judge_config_with_key, mock_runner) + await judge.evaluate("hello", "world") + + call_args = mock_runner.run.call_args + input_arg = call_args[0][0] if call_args[0] else call_args[1].get('input') + expected = "MESSAGE HISTORY:\nhello\n\nRESPONSE TO EVALUATE:\nworld" + assert input_arg == expected + + @pytest.mark.asyncio + async def test_evaluate_legacy_config_strips_template_messages( + self, tracker: LDAIConfigTracker, mock_runner + ): + """Legacy config with assistant/user template messages: runner still gets string input.""" + legacy_messages = [ + LDMessage(role='system', content='You are a strict judge.'), + LDMessage(role='assistant', content='{{message_history}}'), + LDMessage(role='user', content='Evaluate: {{response_to_evaluate}}'), + ] + config = _make_judge_config(messages=legacy_messages, tracker=tracker) + + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={'score': 0.75, 'reasoning': 'Mostly relevant.'}, + ) + mock_runner.run.return_value = mock_response + tracker.track_metrics_of_async = AsyncMock( + side_effect=lambda _metric_fn, fn: fn() + ) + + judge = Judge(config, mock_runner) + await judge.evaluate("input", "output") + + call_args = mock_runner.run.call_args + input_arg = call_args[0][0] if call_args[0] else call_args[1].get('input') + assert isinstance(input_arg, str) + assert "MESSAGE HISTORY:\ninput" in input_arg + assert "RESPONSE TO EVALUATE:\noutput" in input_arg @pytest.mark.asyncio async def test_evaluate_success_with_valid_response( From efaa5b778e6bd42ba12297d4553f76cba532791e Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Tue, 5 May 2026 14:54:09 -0500 Subject: [PATCH 2/4] refactor: move legacy judge message stripping to client._judge_config() Co-Authored-By: Claude Sonnet 4.6 --- packages/sdk/server-ai/src/ldai/client.py | 6 +++++- packages/sdk/server-ai/src/ldai/judge/__init__.py | 13 ------------- packages/sdk/server-ai/tests/test_judge.py | 8 ++++++-- 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/client.py b/packages/sdk/server-ai/src/ldai/client.py index de60c0d2..d4a10bb2 100644 --- a/packages/sdk/server-ai/src/ldai/client.py +++ b/packages/sdk/server-ai/src/ldai/client.py @@ -9,7 +9,7 @@ from ldai import log from ldai.agent_graph import AgentGraphDefinition from ldai.evaluator import Evaluator -from ldai.judge import Judge +from ldai.judge import Judge, _strip_legacy_judge_messages from ldai.managed_agent import ManagedAgent from ldai.managed_agent_graph import ManagedAgentGraph from ldai.managed_model import ManagedModel @@ -237,6 +237,10 @@ def _extract_evaluation_metric_key(variation: Dict[str, Any]) -> Optional[str]: evaluation_metric_key = _extract_evaluation_metric_key(variation) + # strip legacy judge template messages before creating config + 
if messages: + messages = _strip_legacy_judge_messages(messages) + config = AIJudgeConfig( key=key, enabled=bool(enabled), diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py index 3a83152d..a2927016 100644 --- a/packages/sdk/server-ai/src/ldai/judge/__init__.py +++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py @@ -160,19 +160,6 @@ def get_model_runner(self) -> Runner: return self._model_runner def _build_evaluation_input(self, input_text: str, output_text: str) -> str: - """ - Build the string input for the judge runner. - - Legacy messages (assistant/user messages containing ``{{message_history}}`` - or ``{{response_to_evaluate}}``) are stripped from the config; the runner - was already created from the judge AI config (which carries the system - message), so only the plain-text evaluation payload is needed here. - - :param input_text: The input text (message history) - :param output_text: The output text to evaluate - :return: Formatted evaluation input string - """ - _strip_legacy_judge_messages(self._ai_config.messages or []) return f"MESSAGE HISTORY:\n{input_text}\n\nRESPONSE TO EVALUATE:\n{output_text}" def _parse_evaluation_response(self, data: Dict[str, Any]) -> Optional[Tuple[float, str]]: diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py index d6e50c26..e1c6b902 100644 --- a/packages/sdk/server-ai/tests/test_judge.py +++ b/packages/sdk/server-ai/tests/test_judge.py @@ -292,10 +292,14 @@ async def test_evaluate_string_input_format( assert input_arg == expected @pytest.mark.asyncio - async def test_evaluate_legacy_config_strips_template_messages( + async def test_evaluate_legacy_config_passes_string_input_to_runner( self, tracker: LDAIConfigTracker, mock_runner ): - """Legacy config with assistant/user template messages: runner still gets string input.""" + """ + Judge built directly with legacy messages (bypassing the client) still passes + a formatted string to the runner. Legacy message stripping is the client's + responsibility; the Judge itself does not strip. + """ legacy_messages = [ LDMessage(role='system', content='You are a strict judge.'), LDMessage(role='assistant', content='{{message_history}}'), From 1e8194f3a4547b79301eab930d44dd8f7f003be2 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Tue, 5 May 2026 15:10:30 -0500 Subject: [PATCH 3/4] fix: runners prepend config messages; managed layer passes plain str to run() Co-Authored-By: Claude Sonnet 4.6 --- .../src/ldai_langchain/langchain_model_runner.py | 8 ++++++-- .../src/ldai_langchain/langchain_runner_factory.py | 3 ++- .../src/ldai_openai/openai_model_runner.py | 7 ++++++- .../src/ldai_openai/openai_runner_factory.py | 3 ++- packages/sdk/server-ai/src/ldai/managed_model.py | 5 +---- 5 files changed, 17 insertions(+), 9 deletions(-) diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py index 18e542dc..6b0fc240 100644 --- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py +++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py @@ -24,8 +24,9 @@ class LangChainModelRunner(Runner): :meth:`run`. 
""" - def __init__(self, llm: BaseChatModel): + def __init__(self, llm: BaseChatModel, config_messages: Optional[List[LDMessage]] = None): self._llm = llm + self._config_messages: List[LDMessage] = list(config_messages or []) def get_llm(self) -> BaseChatModel: """ @@ -43,6 +44,9 @@ async def run( """ Run the LangChain model with the given input. + Prepends any config messages (system prompt, instructions, etc.) stored + at construction time before the user message. + :param input: A string prompt :param output_type: Optional JSON schema dict requesting structured output. When provided, ``parsed`` on the returned :class:`RunnerResult` is @@ -50,7 +54,7 @@ async def run( :return: :class:`RunnerResult` containing ``content``, ``metrics``, ``raw`` and (when ``output_type`` is set) ``parsed``. """ - messages = [LDMessage(role='user', content=input)] + messages = self._config_messages + [LDMessage(role='user', content=input)] if output_type is not None: return await self._run_structured(messages, output_type) diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_runner_factory.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_runner_factory.py index 94c427a0..f53f9f00 100644 --- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_runner_factory.py +++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_runner_factory.py @@ -69,4 +69,5 @@ def create_model(self, config: AIConfigKind) -> LangChainModelRunner: :return: LangChainModelRunner ready to invoke the model """ llm = create_langchain_model(config) - return LangChainModelRunner(llm) + config_messages = list(getattr(config, 'messages', None) or []) + return LangChainModelRunner(llm, config_messages) diff --git a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py index f04f6c12..1ef775d4 100644 --- a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py +++ b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py @@ -28,10 +28,12 @@ def __init__( client: AsyncOpenAI, model_name: str, parameters: Dict[str, Any], + config_messages: Optional[List[LDMessage]] = None, ): self._client = client self._model_name = model_name self._parameters = parameters + self._config_messages: List[LDMessage] = list(config_messages or []) async def run( self, @@ -41,6 +43,9 @@ async def run( """ Run the OpenAI model with the given input. + Prepends any config messages (system prompt, instructions, etc.) stored + at construction time before the user message. + :param input: A string prompt :param output_type: Optional JSON schema dict requesting structured output. When provided, ``parsed`` on the returned :class:`RunnerResult` is @@ -48,7 +53,7 @@ async def run( :return: :class:`RunnerResult` containing ``content``, ``metrics``, ``raw`` and (when ``output_type`` is set) ``parsed``. 
""" - messages = [LDMessage(role='user', content=input)] + messages = self._config_messages + [LDMessage(role='user', content=input)] if output_type is not None: return await self._run_structured(messages, output_type) diff --git a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_runner_factory.py b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_runner_factory.py index 93d55770..01f8a166 100644 --- a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_runner_factory.py +++ b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_runner_factory.py @@ -100,7 +100,8 @@ def create_model(self, config: AIConfigKind) -> OpenAIModelRunner: tool_defs = parameters.pop('tools', None) or [] if tool_defs: parameters['tools'] = normalize_tool_types(tool_defs) - return OpenAIModelRunner(self._client, model_name, parameters) + config_messages = list(getattr(config, 'messages', None) or []) + return OpenAIModelRunner(self._client, model_name, parameters, config_messages) def get_client(self) -> AsyncOpenAI: """ diff --git a/packages/sdk/server-ai/src/ldai/managed_model.py b/packages/sdk/server-ai/src/ldai/managed_model.py index 04db0d77..8f85709e 100644 --- a/packages/sdk/server-ai/src/ldai/managed_model.py +++ b/packages/sdk/server-ai/src/ldai/managed_model.py @@ -43,12 +43,9 @@ async def run(self, prompt: str) -> ManagedResult: user_message = LDMessage(role='user', content=prompt) self._messages.append(user_message) - config_messages = self._ai_config.messages or [] - all_messages = config_messages + self._messages - result: RunnerResult = await tracker.track_metrics_of_async( lambda r: r.metrics, - lambda: self._model_runner.run(all_messages), + lambda: self._model_runner.run(prompt), ) assistant_message = LDMessage(role='assistant', content=result.content) From 133bc10860b88e2ef6cc2040b708a60323546162 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Tue, 5 May 2026 15:14:10 -0500 Subject: [PATCH 4/4] fix: update runner tests to pass str to run() instead of list[LDMessage] Co-Authored-By: Claude Sonnet 4.6 --- .../tests/test_langchain_provider.py | 15 ++++------- .../tests/test_openai_provider.py | 26 ++++++------------- 2 files changed, 13 insertions(+), 28 deletions(-) diff --git a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py index d6cf83f6..a0ee1763 100644 --- a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py +++ b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py @@ -233,8 +233,7 @@ async def test_returns_success_true_for_string_content(self, mock_llm): mock_llm.ainvoke = AsyncMock(return_value=mock_response) provider = LangChainModelRunner(mock_llm) - messages = [LDMessage(role='user', content='Hello')] - result = await provider.run(messages) + result = await provider.run('Hello') assert result.metrics.success is True assert result.content == 'Test response' @@ -246,8 +245,7 @@ async def test_returns_success_false_for_non_string_content_and_logs_warning(sel mock_llm.ainvoke = AsyncMock(return_value=mock_response) provider = LangChainModelRunner(mock_llm) - messages = [LDMessage(role='user', content='Hello')] - result = await provider.run(messages) + result = await provider.run('Hello') assert result.metrics.success is False assert result.content == '' @@ -259,8 +257,7 @@ async def test_returns_success_false_when_model_invocation_throws_error(self, mo mock_llm.ainvoke = AsyncMock(side_effect=error) 
provider = LangChainModelRunner(mock_llm) - messages = [LDMessage(role='user', content='Hello')] - result = await provider.run(messages) + result = await provider.run('Hello') assert result.metrics.success is False assert result.content == '' @@ -284,9 +281,8 @@ async def test_returns_success_true_for_successful_invocation(self, mock_llm): mock_llm.with_structured_output = MagicMock(return_value=mock_structured_llm) provider = LangChainModelRunner(mock_llm) - messages = [LDMessage(role='user', content='Hello')] response_structure = {'type': 'object', 'properties': {}} - result = await provider.run(messages, output_type=response_structure) + result = await provider.run('Hello', output_type=response_structure) assert result.metrics.success is True assert result.parsed == parsed_data @@ -300,9 +296,8 @@ async def test_returns_success_false_when_structured_model_invocation_throws_err mock_llm.with_structured_output = MagicMock(return_value=mock_structured_llm) provider = LangChainModelRunner(mock_llm) - messages = [LDMessage(role='user', content='Hello')] response_structure = {'type': 'object', 'properties': {}} - result = await provider.run(messages, output_type=response_structure) + result = await provider.run('Hello', output_type=response_structure) assert result.metrics.success is False assert result.parsed is None diff --git a/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py b/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py index 3b69d3f6..4a1eb5f8 100644 --- a/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py +++ b/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py @@ -4,8 +4,6 @@ from typing import Any from unittest.mock import AsyncMock, MagicMock, patch -from ldai import LDMessage - from ldai_openai import OpenAIModelRunner, OpenAIRunnerFactory, get_ai_metrics_from_response, get_ai_usage_from_response @@ -143,8 +141,7 @@ async def test_invokes_openai_chat_completions_and_returns_response(self, mock_c mock_client.chat.completions.create = AsyncMock(return_value=mock_response) provider = OpenAIModelRunner(mock_client, 'gpt-3.5-turbo', {}) - messages = [LDMessage(role='user', content='Hello!')] - result = await provider.run(messages) + result = await provider.run('Hello!') mock_client.chat.completions.create.assert_called_once_with( model='gpt-3.5-turbo', @@ -172,8 +169,7 @@ async def test_returns_unsuccessful_response_when_no_content(self, mock_client): mock_client.chat.completions.create = AsyncMock(return_value=mock_response) provider = OpenAIModelRunner(mock_client, 'gpt-3.5-turbo', {}) - messages = [LDMessage(role='user', content='Hello!')] - result = await provider.run(messages) + result = await provider.run('Hello!') assert result.content == '' assert result.metrics.success is False @@ -190,8 +186,7 @@ async def test_returns_unsuccessful_response_when_choices_empty(self, mock_clien mock_client.chat.completions.create = AsyncMock(return_value=mock_response) provider = OpenAIModelRunner(mock_client, 'gpt-3.5-turbo', {}) - messages = [LDMessage(role='user', content='Hello!')] - result = await provider.run(messages) + result = await provider.run('Hello!') assert result.content == '' assert result.metrics.success is False @@ -204,8 +199,7 @@ async def test_returns_unsuccessful_response_when_exception_thrown(self, mock_cl mock_client.chat.completions.create = AsyncMock(side_effect=Exception('API Error')) provider = OpenAIModelRunner(mock_client, 'gpt-3.5-turbo', {}) - messages = [LDMessage(role='user', 
content='Hello!')] - result = await provider.run(messages) + result = await provider.run('Hello!') assert result.content == '' assert result.metrics.success is False @@ -234,7 +228,6 @@ async def test_invokes_openai_with_structured_output(self, mock_client): mock_client.chat.completions.create = AsyncMock(return_value=mock_response) provider = OpenAIModelRunner(mock_client, 'gpt-3.5-turbo', {}) - messages = [LDMessage(role='user', content='Tell me about a person')] response_structure = { 'type': 'object', 'properties': { @@ -245,7 +238,7 @@ async def test_invokes_openai_with_structured_output(self, mock_client): 'required': ['name', 'age', 'city'], } - result = await provider.run(messages, output_type=response_structure) + result = await provider.run('Tell me about a person', output_type=response_structure) assert result.parsed == {'name': 'John', 'age': 30, 'city': 'New York'} assert result.content == '{"name": "John", "age": 30, "city": "New York"}' @@ -269,10 +262,9 @@ async def test_returns_unsuccessful_when_no_content_in_structured_response(self, mock_client.chat.completions.create = AsyncMock(return_value=mock_response) provider = OpenAIModelRunner(mock_client, 'gpt-3.5-turbo', {}) - messages = [LDMessage(role='user', content='Tell me about a person')] response_structure = {'type': 'object'} - result = await provider.run(messages, output_type=response_structure) + result = await provider.run('Tell me about a person', output_type=response_structure) assert result.parsed is None assert result.content == '' @@ -293,10 +285,9 @@ async def test_handles_json_parsing_errors(self, mock_client): mock_client.chat.completions.create = AsyncMock(return_value=mock_response) provider = OpenAIModelRunner(mock_client, 'gpt-3.5-turbo', {}) - messages = [LDMessage(role='user', content='Tell me about a person')] response_structure = {'type': 'object'} - result = await provider.run(messages, output_type=response_structure) + result = await provider.run('Tell me about a person', output_type=response_structure) assert result.parsed is None assert result.content == 'invalid json content' @@ -312,10 +303,9 @@ async def test_returns_unsuccessful_response_when_exception_thrown(self, mock_cl mock_client.chat.completions.create = AsyncMock(side_effect=Exception('API Error')) provider = OpenAIModelRunner(mock_client, 'gpt-3.5-turbo', {}) - messages = [LDMessage(role='user', content='Tell me about a person')] response_structure = {'type': 'object'} - result = await provider.run(messages, output_type=response_structure) + result = await provider.run('Tell me about a person', output_type=response_structure) assert result.parsed is None assert result.content == ''
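
Illustrative sketch (not part of the patch series): how a caller exercises the string-based run() contract these four commits converge on. The OpenAIModelRunner constructor shape follows PATCH 3/4 and the evaluation payload format follows Judge._build_evaluation_input from PATCH 1/4; the model name, parameters, and system prompt below are placeholder values, not anything mandated by the SDK.

# Hypothetical caller-side sketch; runner and message APIs are taken from the
# hunks above, all concrete values are placeholders.
import asyncio

from openai import AsyncOpenAI

from ldai import LDMessage
from ldai_openai import OpenAIModelRunner


async def main() -> None:
    # Config messages (e.g. a judge's system prompt) are supplied at construction
    # time; the runner prepends them ahead of the single user message it builds.
    runner = OpenAIModelRunner(
        AsyncOpenAI(),
        'gpt-3.5-turbo',
        {},  # extra chat-completion parameters
        [LDMessage(role='system', content='You are a strict judge.')],
    )

    # run() now takes a plain string. The judge builds its payload in the same
    # "MESSAGE HISTORY / RESPONSE TO EVALUATE" shape as _build_evaluation_input.
    evaluation_input = (
        "MESSAGE HISTORY:\nWhat is AI?\n\n"
        "RESPONSE TO EVALUATE:\nAI is artificial intelligence."
    )
    result = await runner.run(
        evaluation_input,
        output_type={'type': 'object', 'properties': {}},
    )
    print(result.content, result.parsed, result.metrics.success)


asyncio.run(main())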