diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py
index f05756b5..6db89f68 100644
--- a/packages/sdk/server-ai/src/ldai/judge/__init__.py
+++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py
@@ -51,7 +51,7 @@ async def evaluate(
         :param input_text: The input prompt or question that was provided to the AI
         :param output_text: The AI-generated response to be evaluated
         :param sampling_rate: Sampling rate (0-1) to determine if evaluation should be processed (defaults to 1)
-        :return: Evaluation result; ``sampled=True`` when skipped due to sampling rate
+        :return: The result of the judge evaluation; ``sampled`` is ``False`` when the evaluation was skipped due to the sampling rate.
         """
         judge_result = JudgeResult(judge_config_key=self._ai_config.key)

@@ -70,9 +70,9 @@ async def evaluate(

         if random.random() > sampling_rate:
             log.debug(f'Judge evaluation skipped due to sampling rate: {sampling_rate}')
-            judge_result.sampled = True
             return judge_result

+        judge_result.sampled = True
         messages = self._construct_evaluation_messages(input_text, output_text)

         assert self._evaluation_response_structure is not None
@@ -110,7 +110,7 @@ async def evaluate_messages(
         :param messages: Array of messages representing the conversation history
         :param response: The AI response to be evaluated
         :param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1)
-        :return: Evaluation result; ``sampled=True`` when skipped due to sampling rate
+        :return: The result of the judge evaluation; ``sampled`` is ``False`` when the evaluation was skipped due to the sampling rate.
         """
         input_text = '\r\n'.join([msg.content for msg in messages]) if messages else ''
         output_text = response.message.content
diff --git a/packages/sdk/server-ai/src/ldai/providers/types.py b/packages/sdk/server-ai/src/ldai/providers/types.py
index 4ad626ba..083141d6 100644
--- a/packages/sdk/server-ai/src/ldai/providers/types.py
+++ b/packages/sdk/server-ai/src/ldai/providers/types.py
@@ -65,10 +65,10 @@ class JudgeResult:
     judge_config_key: Optional[str] = None
     success: bool = False
     error_message: Optional[str] = None
-    sampled: bool = False  # True when the judge was skipped due to sampling rate
+    sampled: bool = False  # True when the evaluation was sampled in and ran; False when skipped
+    metric_key: Optional[str] = None
     score: Optional[float] = None
     reasoning: Optional[str] = None
-    metric_key: Optional[str] = None

     def to_dict(self) -> Dict[str, Any]:
         """
diff --git a/packages/sdk/server-ai/src/ldai/tracker.py b/packages/sdk/server-ai/src/ldai/tracker.py
index b1894f40..e071ae1c 100644
--- a/packages/sdk/server-ai/src/ldai/tracker.py
+++ b/packages/sdk/server-ai/src/ldai/tracker.py
@@ -253,6 +253,9 @@ def track_judge_result(self, judge_result: Any, *, graph_key: Optional[str] = No
         :param judge_result: JudgeResult object containing score, metric key, and success status
         :param graph_key: When set, include ``graphKey`` in the event payload.
         """
+        if not judge_result.sampled:
+            return
+
         if judge_result.success and judge_result.metric_key:
             track_data = self.__get_track_data(graph_key=graph_key)
             if judge_result.judge_config_key:
diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py
index 76bccd0f..b4922d61 100644
--- a/packages/sdk/server-ai/tests/test_judge.py
+++ b/packages/sdk/server-ai/tests/test_judge.py
@@ -168,6 +168,7 @@ async def test_evaluate_success_with_valid_response(

         assert isinstance(result, JudgeResult)
         assert result.success is True
+        assert result.sampled is True
         assert result.metric_key == '$ld:ai:judge:relevance'
         assert result.score == 0.85
         assert result.reasoning is not None
@@ -194,6 +195,7 @@ async def test_evaluate_success_with_evaluation_response_shape(

         assert isinstance(result, JudgeResult)
         assert result.success is True
+        assert result.sampled is True
         assert result.metric_key == '$ld:ai:judge:relevance'
         assert result.score == 0.9
         assert result.reasoning is not None
@@ -288,13 +290,13 @@ async def test_evaluate_handles_exception(
     async def test_evaluate_respects_sampling_rate(
         self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
     ):
-        """Evaluate should return sampled=True when skipped due to sampling rate."""
+        """Evaluate should return sampled=False when skipped due to sampling rate."""
         judge = Judge(judge_config_with_key, tracker, mock_runner)

         result = await judge.evaluate("input", "output", sampling_rate=0.0)

         assert isinstance(result, JudgeResult)
-        assert result.sampled is True
+        assert result.sampled is False
         assert result.success is False

         mock_runner.invoke_structured_model.assert_not_called()