Commit 2abe91e
fix(components): Add relevant component and pipeline inputs/outputs to support creating ModelEvaluations as part of the AutoSxS Metrics component

PiperOrigin-RevId: 615675169
Googler committed Mar 14, 2024
1 parent a0f3815 commit 2abe91e
Showing 4 changed files with 68 additions and 5 deletions.
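The headline change is that the AutoSxS Metrics step can now create ModelEvaluation resources for Model Registry models. For orientation, here is a minimal sketch of retrieving such evaluations after a run, assuming the `google-cloud-aiplatform` SDK; the project, location, and model resource name are illustrative:

```python
from google.cloud import aiplatform

# Illustrative project and location; substitute your own.
aiplatform.init(project='my-project', location='us-central1')

# If model_a was a Model Registry resource, the pipeline now attaches a
# ModelEvaluation to it, listable like any other evaluation.
model = aiplatform.Model('projects/123/locations/us-central1/models/456')
for evaluation in model.list_model_evaluations():
    print(evaluation.resource_name)
```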
1 change: 1 addition & 0 deletions components/google-cloud/RELEASE.md
@@ -6,6 +6,7 @@
* Use `eval_dataset` for train-time evaluation when training a reward model. Requires `eval_dataset` to contain the same fields as the [preference dataset](https://cloud.google.com/vertex-ai/docs/generative-ai/models/tune-text-models-rlhf#human-preference-dataset).
* Update the documentation of `GetModel`.
* Add CMEK support to `preview.model_evaluation.autosxs_pipeline`.
* Updated component and pipeline inputs/outputs to support creating ModelEvaluations for ModelRegistry models in the AutoSxS pipeline.

## Release 2.10.0
* Fix the missing output of pipeline remote runner. `AutoMLImageTrainingJobRunOp` now passes the model artifacts correctly to downstream components.
@@ -17,4 +17,4 @@
DO NOT EDIT - This file is generated, manual changes will be overridden.
"""

-IMAGE_TAG = '20240310_1707'
+IMAGE_TAG = '20240313_1707'
@@ -33,26 +33,48 @@ def model_evaluation_text_generation_pairwise(
judgments_dir: str,
autosxs_metrics: dsl.Output[dsl.Metrics], # pylint: disable=unused-argument # pytype: disable=unsupported-operands
gcp_resources: dsl.OutputPath(str), # pytype: disable=invalid-annotation
model_a_evaluation_path: dsl.OutputPath(str), # pylint: disable=unused-argument # pytype: disable=unsupported-operands
model_b_evaluation_path: dsl.OutputPath(str), # pylint: disable=unused-argument # pytype: disable=unsupported-operands
evaluation_count_path: dsl.OutputPath(int), # pylint: disable=unused-argument # pytype: disable=unsupported-operands
evaluation_dataset_path: dsl.OutputPath(str), # pylint: disable=unused-argument # pytype: disable=unsupported-operands
human_preference_column: str = '',
project: str = _placeholders.PROJECT_ID_PLACEHOLDER,
location: str = _placeholders.LOCATION_PLACEHOLDER,
encryption_spec_key_name: str = '',
model_a: str = '',
model_b: str = '',
evaluation_dataset: str = '',
evaluation_dataset_metadata: str = '', # pylint: disable=unused-argument
task: str = '',
) -> dsl.ContainerSpec: # pylint: disable=g-doc-args
"""Compute AutoSXS metrics using judgments outputs from Arbiter.
Args:
-judgments_dir: Path where store the Judgments.
+judgments_dir: Path to store the Judgments.
human_preference_column: The column containing ground truths. The default
value is an empty string if not provided by users.
project: Project to upload evaluation metrics to.
location: Location to upload evaluation metrics to.
encryption_spec_key_name: Customer-managed encryption key options. If this
is set, then all resources created by the component will be encrypted with
the provided encryption key.
model_a: Resource path for Model A.
model_b: Resource path for Model B.
evaluation_dataset: Path to the evaluation dataset.
evaluation_dataset_metadata: AutoSxS metrics metadata json string.
task: Task that was used for this AutoSxS run.
Returns:
autosxs_metrics: AutoSxS win rate metrics and human alignment metrics.
gcp_resources: Tracker for GCP resources created by this component.
model_a_evaluation_path: Path to write the ModelEvaluation for Model A if it
is a ModelRegistry model.
model_b_evaluation_path: Path to write the ModelEvaluation for Model B if it
is a ModelRegistry model.
evaluation_count_path: Path to write the number of evaluations to.
evaluation_dataset_path: Path to write the path to the evaluation dataset.
This is needed because Pipeline outputs must be component outputs.
"""
return gcpc_utils.build_serverless_customjob_container_spec(
project=project,
@@ -69,6 +91,15 @@ def model_evaluation_text_generation_pairwise(
f'--project={project}',
f'--location={location}',
'--executor_input={{$.json_escape[1]}}',
f'--model_a={model_a}',
f'--model_b={model_b}',
f'--model_a_evaluation_path={model_a_evaluation_path}',
f'--model_b_evaluation_path={model_b_evaluation_path}',
f'--evaluation_count_path={evaluation_count_path}',
f'--evaluation_dataset_path={evaluation_dataset_path}',
f'--evaluation_dataset={evaluation_dataset}',
"--evaluation_dataset_metadata={{$.inputs.parameters['evaluation_dataset_metadata'].json_escape[0]}}",
f'--task={task}',
],
encryption_spec_key_name=encryption_spec_key_name,
),
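The container spec above fixes a flag contract between the component and the AutoSxS image. The entrypoint itself is not part of this commit, so the following is a hypothetical sketch: the flag names mirror the args list above, while the parsing and the output-file write are illustrative assumptions.

```python
# Hypothetical sketch; the real AutoSxS runner is not shown in this commit.
import argparse

parser = argparse.ArgumentParser()
for flag in (
    'model_a', 'model_b', 'model_a_evaluation_path',
    'model_b_evaluation_path', 'evaluation_count_path',
    'evaluation_dataset_path', 'evaluation_dataset',
    'evaluation_dataset_metadata', 'task',
):
    parser.add_argument(f'--{flag}', default='')
args, _ = parser.parse_known_args()  # tolerate --project, --location, etc.

# KFP materializes each dsl.OutputPath as a local file; a runner reports a
# value by writing it to that file, e.g. an (illustrative) evaluation count:
with open(args.evaluation_count_path, 'w') as f:
    f.write('1000')
```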
@@ -13,14 +13,22 @@
# limitations under the License.
"""Optimization AI Inference and AutoSxS pipeline function."""

-from typing import Any, Dict, List
+from typing import Any, Dict, List, NamedTuple

from google_cloud_pipeline_components import _placeholders
from google_cloud_pipeline_components._implementation.llm import batch_prediction_pairwise
from google_cloud_pipeline_components._implementation.llm import model_evaluation_text_generation_pairwise
from google_cloud_pipeline_components._implementation.llm import online_evaluation_pairwise
from kfp import dsl

PipelineOutput = NamedTuple(
'Outputs',
model_a_evaluation_resource_name=str,
model_b_evaluation_resource_name=str,
evaluation_count=int,
evaluation_dataset_path=str,
)
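# Note: KFP exposes multiple pipeline-level outputs by annotating the pipeline
# function's return type with a NamedTuple such as this one and returning an
# instance of it from the pipeline body.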


# pylint: disable=dangerous-default-value,g-bare-generic,unused-argument
@dsl.pipeline(
@@ -47,7 +55,7 @@ def autosxs_pipeline(
bigquery_destination_prefix: str = '',
experimental_args: Dict[str, Any] = {},
encryption_spec_key_name: str = '',
-):
+) -> PipelineOutput:
# fmt: off
"""Evaluates two models side-by-side using an arbiter model.
@@ -71,6 +79,12 @@
bigquery_destination_prefix: BigQuery table to write judgments to if the specified format is 'bigquery'.
experimental_args: Experimentally released arguments. Subject to change.
encryption_spec_key_name: Customer-managed encryption key options. If this is set, then all resources created by the pipeline will be encrypted with the provided encryption key.
Returns:
model_a_evaluation_resource_name: The resource name of the ModelEvaluation created for Model A if Model A is a ModelRegistry model.
model_b_evaluation_resource_name: The resource name of the ModelEvaluation created for Model B if Model B is a ModelRegistry model.
evaluation_count: The number of evaluations included in this AutoSxS run.
evaluation_dataset_path: The path to the overall evaluation dataset including judgments.
"""
# fmt: on
responses = batch_prediction_pairwise.batch_prediction_pairwise(
@@ -109,12 +123,29 @@ def autosxs_pipeline(
encryption_spec_key_name=encryption_spec_key_name,
).set_display_name('AutoSxS Autorater')

-model_evaluation_text_generation_pairwise.model_evaluation_text_generation_pairwise(
+metrics = model_evaluation_text_generation_pairwise.model_evaluation_text_generation_pairwise(
judgments_dir=winners.outputs['judgments_uri'],
human_preference_column=human_preference_column,
project=project,
location=location,
encryption_spec_key_name=encryption_spec_key_name,
model_a=model_a,
model_b=model_b,
evaluation_dataset=evaluation_dataset,
evaluation_dataset_metadata=winners.outputs['metadata'],
task=task,
).set_display_name(
'AutoSxS Metrics'
)

return PipelineOutput(
model_a_evaluation_resource_name=metrics.outputs[
'model_a_evaluation_path'
],
model_b_evaluation_resource_name=metrics.outputs[
'model_b_evaluation_path'
],
evaluation_count=metrics.outputs['evaluation_count_path'],
# Needs to be a component output
evaluation_dataset_path=metrics.outputs['evaluation_dataset_path'],
)
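With the new return value in place, the pipeline's evaluation outputs can be read from a finished run. A minimal sketch, assuming the KFP compiler, the Vertex AI SDK, and the import path named in the release notes above; parameter values are illustrative, and only parameters visible in this diff are shown (the pipeline requires others, omitted here):

```python
from google.cloud import aiplatform
from kfp import compiler

from google_cloud_pipeline_components.preview.model_evaluation import (
    autosxs_pipeline,
)

# Compile the pipeline function to a reusable template.
compiler.Compiler().compile(
    pipeline_func=autosxs_pipeline,
    package_path='autosxs_pipeline.yaml',
)

job = aiplatform.PipelineJob(
    display_name='autosxs',  # illustrative
    template_path='autosxs_pipeline.yaml',
    parameter_values={
        'evaluation_dataset': 'gs://my-bucket/eval.jsonl',  # illustrative
        'task': 'summarization',  # illustrative
        'model_a': 'projects/123/locations/us-central1/models/456',
        'model_b': 'publishers/google/models/text-bison@002',
    },
)
job.run()
```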
