Commit 2abe91e
fix(components): Add relevant component and pipeline inputs/outputs to support creating ModelEvaluations as part of the AutoSxS Metrics component

PiperOrigin-RevId: 615675169
Googler committed Mar 14, 2024
1 parent a0f3815 commit 2abe91e
Showing 4 changed files with 68 additions and 5 deletions.
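The headline change is that the AutoSxS Metrics step can now create ModelEvaluation resources for Model Registry models. For orientation, here is a minimal sketch of retrieving such evaluations after a run, assuming the `google-cloud-aiplatform` SDK; the project, location, and model resource name are illustrative:

```python
from google.cloud import aiplatform

# Illustrative project and location; substitute your own.
aiplatform.init(project='my-project', location='us-central1')

# If model_a was a Model Registry resource, the pipeline now attaches a
# ModelEvaluation to it, listable like any other evaluation.
model = aiplatform.Model('projects/123/locations/us-central1/models/456')
for evaluation in model.list_model_evaluations():
    print(evaluation.resource_name)
```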
1 change: 1 addition & 0 deletions components/google-cloud/RELEASE.md
@@ -6,6 +6,7 @@
* Use `eval_dataset` for train-time evaluation when training a reward model. Requires `eval_dataset` to contain the same fields as the [preference dataset](https://cloud.google.com/vertex-ai/docs/generative-ai/models/tune-text-models-rlhf#human-preference-dataset).
* Update the documentation of `GetModel`.
* Add CMEK support to `preview.model_evaluation.autosxs_pipeline`.
* Updated component and pipeline inputs/outputs to support creating ModelEvaluations for ModelRegistry models in the AutoSxS pipeline.

## Release 2.10.0
* Fix the missing output of pipeline remote runner. `AutoMLImageTrainingJobRunOp` now passes the model artifacts correctly to downstream components.
@@ -17,4 +17,4 @@
DO NOT EDIT - This file is generated, manual changes will be overridden.
"""

-IMAGE_TAG = '20240310_1707'
+IMAGE_TAG = '20240313_1707'
@@ -33,26 +33,48 @@ def model_evaluation_text_generation_pairwise(
judgments_dir: str,
autosxs_metrics: dsl.Output[dsl.Metrics], # pylint: disable=unused-argument # pytype: disable=unsupported-operands
gcp_resources: dsl.OutputPath(str), # pytype: disable=invalid-annotation
model_a_evaluation_path: dsl.OutputPath(str), # pylint: disable=unused-argument # pytype: disable=unsupported-operands
model_b_evaluation_path: dsl.OutputPath(str), # pylint: disable=unused-argument # pytype: disable=unsupported-operands
evaluation_count_path: dsl.OutputPath(int), # pylint: disable=unused-argument # pytype: disable=unsupported-operands
evaluation_dataset_path: dsl.OutputPath(str), # pylint: disable=unused-argument # pytype: disable=unsupported-operands
human_preference_column: str = '',
project: str = _placeholders.PROJECT_ID_PLACEHOLDER,
location: str = _placeholders.LOCATION_PLACEHOLDER,
encryption_spec_key_name: str = '',
model_a: str = '',
model_b: str = '',
evaluation_dataset: str = '',
evaluation_dataset_metadata: str = '', # pylint: disable=unused-argument
task: str = '',
) -> dsl.ContainerSpec: # pylint: disable=g-doc-args
"""Compute AutoSXS metrics using judgments outputs from Arbiter.
Args:
-judgments_dir: Path where store the Judgments.
+judgments_dir: Path to store the Judgments.
human_preference_column: The column containing ground truths. The default
value is an empty string if not provided by users.
project: Project to upload evaluation metrics to.
location: Location to upload evaluation metrics to.
encryption_spec_key_name: Customer-managed encryption key options. If this
is set, then all resources created by the component will be encrypted with
the provided encryption key.
model_a: Resource path for Model A.
model_b: Resource path for Model B.
evaluation_dataset: Path to the evaluation dataset.
evaluation_dataset_metadata: AutoSxS metrics metadata json string.
task: Task that was used for this AutoSxS run.
Returns:
autosxs_metrics: AutoSxS win rate metrics and human alignment metrics.
gcp_resources: Tracker for GCP resources created by this component.
model_a_evaluation_path: Path to write the ModelEvaluation for Model A if it
is a ModelRegistry model.
model_b_evaluation_path: Path to write the ModelEvaluation for Model B if it
is a ModelRegistry model.
evaluation_count_path: Path to write the number of evaluations to.
evaluation_dataset_path: Path to write the path to the evaluation dataset.
This is needed because Pipeline outputs must be component outputs.
"""
return gcpc_utils.build_serverless_customjob_container_spec(
project=project,
@@ -69,6 +91,15 @@ def model_evaluation_text_generation_pairwise(
f'--project={project}',
f'--location={location}',
'--executor_input={{$.json_escape[1]}}',
f'--model_a={model_a}',
f'--model_b={model_b}',
f'--model_a_evaluation_path={model_a_evaluation_path}',
f'--model_b_evaluation_path={model_b_evaluation_path}',
f'--evaluation_count_path={evaluation_count_path}',
f'--evaluation_dataset_path={evaluation_dataset_path}',
f'--evaluation_dataset={evaluation_dataset}',
"--evaluation_dataset_metadata={{$.inputs.parameters['evaluation_dataset_metadata'].json_escape[0]}}",
f'--task={task}',
],
encryption_spec_key_name=encryption_spec_key_name,
),
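The container spec above fixes a flag contract between the component and the AutoSxS image. The entrypoint itself is not part of this commit, so the following is a hypothetical sketch: the flag names mirror the args list above, while the parsing and the output-file write are illustrative assumptions.

```python
# Hypothetical sketch; the real AutoSxS runner is not shown in this commit.
import argparse

parser = argparse.ArgumentParser()
for flag in (
    'model_a', 'model_b', 'model_a_evaluation_path',
    'model_b_evaluation_path', 'evaluation_count_path',
    'evaluation_dataset_path', 'evaluation_dataset',
    'evaluation_dataset_metadata', 'task',
):
    parser.add_argument(f'--{flag}', default='')
args, _ = parser.parse_known_args()  # tolerate --project, --location, etc.

# KFP materializes each dsl.OutputPath as a local file; a runner reports a
# value by writing it to that file, e.g. an (illustrative) evaluation count:
with open(args.evaluation_count_path, 'w') as f:
    f.write('1000')
```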
@@ -13,14 +13,22 @@
# limitations under the License.
"""Optimization AI Inference and AutoSxS pipeline function."""

-from typing import Any, Dict, List
+from typing import Any, Dict, List, NamedTuple

from google_cloud_pipeline_components import _placeholders
from google_cloud_pipeline_components._implementation.llm import batch_prediction_pairwise
from google_cloud_pipeline_components._implementation.llm import model_evaluation_text_generation_pairwise
from google_cloud_pipeline_components._implementation.llm import online_evaluation_pairwise
from kfp import dsl

PipelineOutput = NamedTuple(
'Outputs',
model_a_evaluation_resource_name=str,
model_b_evaluation_resource_name=str,
evaluation_count=int,
evaluation_dataset_path=str,
)
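# Note: KFP exposes multiple pipeline-level outputs by annotating the pipeline
# function's return type with a NamedTuple such as this one and returning an
# instance of it from the pipeline body.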


# pylint: disable=dangerous-default-value,g-bare-generic,unused-argument
@dsl.pipeline(
@@ -47,7 +55,7 @@ def autosxs_pipeline(
bigquery_destination_prefix: str = '',
experimental_args: Dict[str, Any] = {},
encryption_spec_key_name: str = '',
-):
+) -> PipelineOutput:
# fmt: off
"""Evaluates two models side-by-side using an arbiter model.
@@ -71,6 +79,12 @@
bigquery_destination_prefix: BigQuery table to write judgments to if the specified format is 'bigquery'.
experimental_args: Experimentally released arguments. Subject to change.
encryption_spec_key_name: Customer-managed encryption key options. If this is set, then all resources created by the pipeline will be encrypted with the provided encryption key.
Returns:
model_a_evaluation_resource_name: The resource name of the ModelEvaluation created for Model A if Model A is a ModelRegistry model.
model_b_evaluation_resource_name: The resource name of the ModelEvaluation created for Model B if Model B is a ModelRegistry model.
evaluation_count: The number of evaluations included in this AutoSxS run.
evaluation_dataset_path: The path to the overall evaluation dataset including judgments.
"""
# fmt: on
responses = batch_prediction_pairwise.batch_prediction_pairwise(
@@ -109,12 +123,29 @@ def autosxs_pipeline(
encryption_spec_key_name=encryption_spec_key_name,
).set_display_name('AutoSxS Autorater')

-model_evaluation_text_generation_pairwise.model_evaluation_text_generation_pairwise(
+metrics = model_evaluation_text_generation_pairwise.model_evaluation_text_generation_pairwise(
judgments_dir=winners.outputs['judgments_uri'],
human_preference_column=human_preference_column,
project=project,
location=location,
encryption_spec_key_name=encryption_spec_key_name,
model_a=model_a,
model_b=model_b,
evaluation_dataset=evaluation_dataset,
evaluation_dataset_metadata=winners.outputs['metadata'],
task=task,
).set_display_name(
'AutoSxS Metrics'
)

return PipelineOutput(
model_a_evaluation_resource_name=metrics.outputs[
'model_a_evaluation_path'
],
model_b_evaluation_resource_name=metrics.outputs[
'model_b_evaluation_path'
],
evaluation_count=metrics.outputs['evaluation_count_path'],
# Needs to be a component output
evaluation_dataset_path=metrics.outputs['evaluation_dataset_path'],
)
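With the new return value in place, the pipeline's evaluation outputs can be read from a finished run. A minimal sketch, assuming the KFP compiler, the Vertex AI SDK, and the import path named in the release notes above; parameter values are illustrative, and only parameters visible in this diff are shown (the pipeline requires others, omitted here):

```python
from google.cloud import aiplatform
from kfp import compiler

from google_cloud_pipeline_components.preview.model_evaluation import (
    autosxs_pipeline,
)

# Compile the pipeline function to a reusable template.
compiler.Compiler().compile(
    pipeline_func=autosxs_pipeline,
    package_path='autosxs_pipeline.yaml',
)

job = aiplatform.PipelineJob(
    display_name='autosxs',  # illustrative
    template_path='autosxs_pipeline.yaml',
    parameter_values={
        'evaluation_dataset': 'gs://my-bucket/eval.jsonl',  # illustrative
        'task': 'summarization',  # illustrative
        'model_a': 'projects/123/locations/us-central1/models/456',
        'model_b': 'publishers/google/models/text-bison@002',
    },
)
job.run()
```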
