In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## VAPO Custom metric Example 2-3

- [Original Reference](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/prompts/prompt_optimizer/vertex_ai_prompt_optimizer_sdk_custom_metric.ipynb) from [Ivan Nardini](https://github.com/inardini)

### Objective

This notebook demonstrates how to leverage Vertex AI prompt optimizer to optimize a simple prompt for a Gemini model using your own metric. The goal is to use Vertex AI prompt optimizer to find a new prompt template that generates better responses based on your own optimization metric.

This tutorial uses the following Google Cloud services and resources:

- Generative AI on Vertex AI
- Vertex AI prompt optimizer
- Vertex AI Gen AI evaluation
- Vertex AI Custom job
- Cloud Run

The steps performed include:

1. Define the prompt template you want to optimize.
2. Prepare the prompt optimization dataset.
3. Define and deploy your own custom evaluation metric on Cloud function.
4. Set optimization mode and steps.
5. Run the automatic prompt optimization job.
6. Collect the best prompt template and eval metric.
7. Validate the best prompt template.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage pricing](https://cloud.google.com/storage/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## 2. Before you start

### Install Vertex AI SDK for Python and other required packages


In [None]:
# %pip install --quiet 'evaluate' 'jiwer'
%pip install --upgrade --quiet 'google-cloud-aiplatform[evaluation]'
%pip install --upgrade --quiet 'plotly' 'asyncio' 'tqdm' 'tenacity' 'etils' 'importlib_resources' 'fsspec' 'gcsfs' 'nbformat>=4.2.0'

In [None]:
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/prompts/prompt_optimizer/vapo_lib.py
import vapo_lib

In [None]:
# Store a second copy of the helper module under ./tutorial/utils.
# NOTE(review): no other cell in this notebook appears to read ./tutorial/utils — confirm this copy is still needed.
! mkdir -p ./tutorial/utils && wget https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/prompts/prompt_optimizer/vapo_lib.py -P ./tutorial/utils

### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

if "google.colab" in sys.modules:
    try:
        import google.auth
        import google.auth.transport.requests
        from google.colab import auth

        auth.authenticate_user()
        creds, project = google.auth.default()
        authentication = google.auth.transport.requests.Request()
        if creds.token:
            print("Authentication successful.")
        else:
            print("Authentication successful, but no token was returned.")
    except Exception as e:
        print(f"Error during Colab authentication: {e}")

! gcloud auth login

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the following APIs](https://console.cloud.google.com/flows/enableapi?apiid=cloudresourcemanager.googleapis.com,aiplatform.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

#### Set your project ID and project number

In [None]:
# Replace the placeholder with your Google Cloud project ID before running.
PROJECT_ID = "$YOUR_PROJECT_ID"  # @param {type:"string"}

# Make this project the default for the gcloud commands in the rest of the notebook.
! gcloud config set project {PROJECT_ID}

In [None]:
PROJECT_NUMBER = !gcloud projects describe {PROJECT_ID} --format="get(projectNumber)"[0]
PROJECT_NUMBER = PROJECT_NUMBER[0]

#### Region

You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
# Region used to initialize Vertex AI and to deploy the Cloud Function
# (alternative tried previously: "asia-northeast3").
REGION = "us-central1"  # @param {type: "string"}

#### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [None]:
# Replace the placeholder with the name of your Cloud Storage bucket (without the gs:// prefix).
BUCKET_NAME = "$YOUR_BUCKET_NAME"  # @param {type:"string"}
# Full gs:// URI derived from the bucket name; used as the staging bucket and workspace root.
BUCKET_URI = f"gs://{BUCKET_NAME}"  # @param {type:"string"}

In [None]:
# Uncomment to create the bucket if it does not exist yet.
# ! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

#### Service Account and permissions

Vertex AI Prompt optimizer requires a service account with the following permissions:

-   `Vertex AI User` to call Vertex LLM API
-   `Storage Object Admin` to read and write to your GCS bucket.
-   `Artifact Registry Reader` to download the pipeline template from Artifact Registry.
-   `Cloud Run Developer` to deploy function on Cloud Run.
-   `Cloud Run Invoker` to call the deployed function.

[Check out the documentation](https://cloud.google.com/iam/docs/manage-access-service-accounts#iam-view-access-sa-gcloud) to learn how to grant those permissions to a single service account.


> If you are using Vertex AI Workbench, run the following commands directly in the terminal.


In [None]:
# Service account address built from the project number (the naming pattern of the
# Compute Engine default service account); the optimization job is submitted with it.
SERVICE_ACCOUNT = f"{PROJECT_NUMBER}-compute@developer.gserviceaccount.com"

In [None]:
# Bind each listed role to the service account at project level, one gcloud call per role.
for role in ['aiplatform.user', 'storage.objectAdmin', 'artifactregistry.reader', 'run.developer', 'run.invoker']:

    ! gcloud projects add-iam-policy-binding {PROJECT_ID} \
      --member=serviceAccount:{SERVICE_ACCOUNT} \
      --role=roles/{role} --condition=None

### Set tutorial folder and workspace

Set a local folder to collect and organize data and any tutorial artifacts.

In [None]:
from pathlib import Path as path

# Local folders: the tutorial root under the current working directory, and the
# build folder that receives the Cloud Function sources.
ROOT_PATH = path.cwd()
TUTORIAL_PATH = ROOT_PATH.joinpath("tutorial_case2_3_custom")
BUILD_PATH = TUTORIAL_PATH.joinpath("build_case2_3_custom")

# Create both folders; existing folders are left untouched.
for _local_dir in (TUTORIAL_PATH, BUILD_PATH):
    _local_dir.mkdir(parents=True, exist_ok=True)

Set an associated workspace to store prompt optimization results on Cloud Storage bucket.

In [None]:
from etils import epath

# Cloud Storage folders for this tutorial: the workspace root and, inside it,
# the folder that holds the input data.
WORKSPACE_URI = epath.Path(BUCKET_URI) / "optimization_case2_3_custom"
INPUT_DATA_URI = epath.Path(WORKSPACE_URI) / "data_case2_3_custom"

# Create both folders; existing folders are left untouched.
for _gcs_dir in (WORKSPACE_URI, INPUT_DATA_URI):
    _gcs_dir.mkdir(parents=True, exist_ok=True)

### Import libraries

In [None]:
# Tutorial
from argparse import Namespace
import json

# General
import logging
from pprint import pprint
import warnings

from IPython.display import HTML, display
from google.cloud import aiplatform
import pandas as pd
import requests
from sklearn.model_selection import train_test_split

### Libraries settings

In [None]:
# Silence all Python warnings for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")
# Only show errors from the urllib3 connection pool logger.
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)

### Define constants

In [None]:
# Replace the placeholder with the URI of your JSONL dataset.
INPUT_DATA_FILE_URI = "$YOUR_DATA_URI"

# Locations inside the Cloud Storage workspace.
INPUT_OPTIMIZATION_DATA_URI = epath.Path(WORKSPACE_URI) / "prompt_optimization_data"
_optimization_dataset_path = INPUT_DATA_URI / "prompt_optimization_dataset.jsonl"
INPUT_OPTIMIZATION_DATA_FILE_URI = str(_optimization_dataset_path)
OUTPUT_OPTIMIZATION_DATA_URI = epath.Path(WORKSPACE_URI) / "optimization_jobs"

# Container image that runs the prompt optimization job.
APD_CONTAINER_URI = "us-docker.pkg.dev/vertex-ai-restricted/builtin-algorithm/apd:preview_v1_0"

# Where the optimizer configuration file is uploaded.
CONFIG_FILE_URI = str(WORKSPACE_URI / "config" / "config.json")

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project.

In [None]:
# Show the region that the SDK is about to be initialized with.
print(REGION)

In [None]:
# Set the default project, region and staging bucket for the Vertex AI SDK calls that follow.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

## 3. Automated prompt design with Vertex AI prompt optimizer

### Load the dataset

Load the speech transcription dataset from the location set in `INPUT_DATA_FILE_URI`. The dataset is a JSON Lines file, and this notebook uses the following columns:

*   `audio`: The speech input that fills the `{{audio}}` placeholder of the prompt template.
*   `target`: The ground truth transcript—the ideal response expected from the model for that audio.

In [None]:
# Read the JSON Lines dataset (one record per line) into a DataFrame.
prompt_optimization_df = pd.read_json(INPUT_DATA_FILE_URI, lines=True)

In [None]:
# Preview the first rows of the dataset.
prompt_optimization_df.head()

In [None]:
# Show one example restricted to the two columns referenced by the prompt template.
vapo_lib.print_df_rows(prompt_optimization_df[['target', 'audio']], n=1)

### Optimize the prompt template with Vertex AI prompt optimizer with custom metric


#### Prepare the prompt template you want to optimize

A prompt consists of two key parts:

* **System Instruction Template** which is a fixed part of the prompt that controls or alters the model's behavior across all queries for a given task.

* **Prompt Template** which is a dynamic part of the prompt that changes based on the task. Prompt template includes context, task and more. To learn more, see [components of a prompt](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/prompts/prompt-design-strategies#components-of-a-prompt) in the official documentation.

In this scenario, you use Vertex AI prompt optimizer to optimize a simple system instruction template. And you use some examples in the remaining prompt template for evaluating different instruction templates along the optimization process.

> Having the `target` placeholder in the prompt template is optional. It represents the prompt's ground truth response in your prompt optimization dataset that you aim to optimize for your templates. If you don't have the prompt's ground truth response, remember to set the `source_model` parameter to your prompt optimizer configuration (see below) instead of adding ground truth responses. Vertex AI prompt optimizer would run your sample prompts on the source model to generate the ground truth responses for you.

In [None]:
# System instruction to be optimized: asks the model for a transcript and nothing else.
SYSTEM_INSTRUCTION_TEMPLATE = """
Generate a transcript of the speech. Only include the transcript in your response, and do not provide any other answer.
"""

# Prompt template; {{audio}} and {{target}} are placeholders named after dataset columns.
# NOTE(review): the "@@@audio/wav" suffix presumably declares the MIME type of the audio
# placeholder for the optimizer — confirm against the prompt optimizer documentation.
PROMPT_TEMPLATE = """
Speech: {{audio}} @@@audio/wav
Answer : {{target}}
"""

#### Prepare the prompt optimization dataset

To use Vertex AI prompt optimizer, you'll need a CSV or JSONL file with labeled examples.  These examples should follow a specific naming convention. For details see [Optimize prompts](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/prompts/prompt-optimizer).


> For effective **prompt optimization**, provide a dataset of examples where your model is poor in performance when using current system instruction template. For reliable results, use 50-100 distinct samples.

> In case of **prompt migration**, consider using the source model to label examples that the target model struggles with, helping to identify areas for improvement.

#### Upload samples to bucket

Once you prepare your prompt optimization dataset, you can upload them on Cloud Storage bucket.

In [None]:
# Show the destination URI of the optimization dataset.
print(INPUT_OPTIMIZATION_DATA_FILE_URI)

In [None]:
# Write the dataset as JSON Lines (one record per line) to the optimization data URI.
prompt_optimization_df.to_json(
    INPUT_OPTIMIZATION_DATA_FILE_URI, orient="records", lines=True
)

#### Define and deploy your own custom optimization metric on Cloud function

To optimize your prompt template using a custom optimization metric, you need to deploy a function with your own metric code on a Cloud function. To deploy a Cloud function with your own custom metric, you cover the following steps:

1.   Define requirements
2.   Write your own custom metric function code
3.   Deploy the custom code as Cloud function


##### Define requirements

Set the custom metric dependencies.

In [None]:
# Dependencies of the metric function: the Functions Framework, the Vertex AI SDK,
# and the libraries used to compute the error-rate metrics.
requirements = """
functions-framework==3.*
google-cloud-aiplatform
evaluate
jiwer

"""

# Save the dependency list next to the function source in the build folder.
(BUILD_PATH / "requirements.txt").write_text(requirements)

##### Write your own custom metric function

Define the module which contains your own custom metric function definition.

In this case, you have a custom evaluation metric to evaluate transcription quality. The metric result is assembled by the `evaluate_engagement_personalization_fn` function and reported under the `custom_engagement_personalization_score` key (both names are kept from the original sample).

The function removes punctuation from the response and the target, computes the word error rate (WER) and character error rate (CER) between them, and combines them into a single score, `3 - (WER + 2 * CER)`, so that a higher score means a better transcript. It also leverages "gemini-2.5-flash" to act as an "auto-rater" that describes the transcription errors. The result is returned as a dictionary containing two fields: the custom metric's score (as you defined it) and an explanation that helps optimize the prompt template.

You use the `main` function as the entry point of the Cloud Function, receiving a response and a target response as input and returning the evaluation result.


In [None]:
# Source of the Cloud Function module, written to BUILD_PATH / "main.py" below.
# The function strips ASCII punctuation from the response and the target, scores
# the pair as 3 - (WER + 2 * CER), and asks a Gemini model to explain the errors.
# If anything fails while scoring, the handler logs the error together with the
# incoming request payload and returns a default result with score 0.
custom_metric_function_code = '''
"""
This module contains the custom evaluation metric definition to optimize a prompt template with Vertex AI prompt optimizer
"""

from typing import Dict
from vertexai.generative_models import (
  GenerationConfig,
  GenerativeModel
)

import json
import functions_framework
import evaluate
import string

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")
autorater = GenerativeModel("gemini-2.5-flash")

def remove_punctuation(text):
    try:
      punctuations = string.punctuation
      translator = str.maketrans('', '', punctuations)
      return text.translate(translator)
    except Exception:
      print(f"## ERROR when removing punc : {text}")
      return text

def calculate_weighted_score(inference_refined: str, reference_refined: str) -> float:
  wer_score = wer_metric.compute(predictions=[inference_refined], references=[reference_refined])
  cer_score = cer_metric.compute(predictions=[inference_refined], references=[reference_refined])

  final_score = 3 - (wer_score + 2*cer_score)

  return final_score


def get_explanation(inference_refined: str, reference_refined: str) -> str:
  prompt = f"""Evaluate the STT (Speech-to-Text) performance based on the items provided below.

  inference: {inference_refined}
  reference: {reference_refined}

  Compare the inference text against the reference text and identify all errors (e.g., substitutions, deletions, insertions) in the inference.

  """
  response = autorater.generate_content(prompt)
  explanation = response.text

  return explanation


# Define custom evaluation criteria
def evaluate_engagement_personalization_fn(final_score: float, explanation:str) -> Dict[str, str]:
  return {
      "custom_engagement_personalization_score": final_score,
      "explanation": explanation,
  }



# Register an HTTP function with the Functions Framework
@functions_framework.http
def main(request):
  default_return = {"custom_engagement_personalization_score": 0, "explanation": "Wrong format in somewhere"}
  request_json = request.get_json(silent=True)
  if not request_json:
    raise ValueError('Cannot find request json.')

  try:
    inference_refined = remove_punctuation(request_json['response'])
    reference_refined = remove_punctuation(request_json['target'])

    final_score = calculate_weighted_score(inference_refined, reference_refined)
    explanation = get_explanation(inference_refined, reference_refined)

    get_evaluation_result = evaluate_engagement_personalization_fn(final_score, explanation)
    return json.dumps(get_evaluation_result)

  except Exception as e:
    print("##Exception##")
    print(e)
    print("##Request##")
    print(request_json)

    return json.dumps(default_return)

'''

# Save the module into the build folder that is deployed as the Cloud Function.
with open(BUILD_PATH / "main.py", "w") as f:
    f.write(custom_metric_function_code)

##### Deploy the custom metric as a Cloud Function

Use gcloud command line to deploy a Cloud function. To learn more, check out [Deploy a Python service to Cloud Run](https://cloud.google.com/run/docs/quickstarts/build-and-deploy/deploy-python-service) quickstart.


In [None]:
# Deploy the build folder (main.py and requirements.txt) as a 2nd-gen, HTTP-triggered
# function whose entry point is `main`.
# NOTE(review): --min-instances=6 presumably keeps six instances provisioned for as long
# as the function exists — confirm this is intended, and delete the function when done.
!gcloud functions deploy 'custom_metric_case_2_3_transcription' \
 --gen2 \
 --runtime="python310" \
 --source={str(BUILD_PATH)} \
 --entry-point=main \
 --trigger-http \
 --timeout=3600 \
 --memory=2Gb \
 --concurrency=6 \
 --min-instances=6 \
 --project {PROJECT_ID} \
 --region={REGION} \
 --quiet

##### Test your custom evaluation metric

After you deploy your  custom evaluation metric as Cloud function, submit a request to validate the output of the custom evaluation function.

In [None]:
# Print the URL of the deployed function.
! gcloud functions describe 'custom_metric_case_2_3_transcription' --gen2 --region {REGION} --format="value(url)"

In [None]:
# Capture the URL of the deployed function; `!` returns the output as a list of lines.
custom_evaluator_function_uri = ! gcloud functions describe 'custom_metric_case_2_3_transcription' --gen2 --region {REGION} --format="value(url)"
# Keep the first output line, stripped of surrounding whitespace.
custom_evaluator_function_uri = custom_evaluator_function_uri[0].strip()

In [None]:
# Show the URL that the test request is sent to.
print(custom_evaluator_function_uri)

In [None]:
# Placeholder token; it is not sent with the request. To call a function that
# requires authentication, add this header with a real token:
#     "Authorization": f"Bearer {dummy_token}",
dummy_token = 'ABC' # get_auth_token()
headers = {"Content-Type": "application/json"}

# Alternative minimal payload:
# json_data = {
#     "response": "Hey!!",
#     "target": "Hey",
# }

# Sample pair in which the response and the target differ in their first word.
json_data = {
    "response": "가나안에는 큰 숲이 없었기 때문에 나무가 무척 비쌌다",
    "target": "기니안에는 큰 숲이 없었기 때문에 나무가 무척 비쌌다",
}

# Send the sample to the deployed metric function, allowing up to 70 seconds for the reply.
response = requests.post(
    custom_evaluator_function_uri,
    headers=headers,
    json=json_data,
    timeout=70,
)  # .json()
pprint(response)

In [None]:
# Decode the body of the reply as JSON: the metric score and its explanation.
response.json()

#### Configure optimization settings

Vertex AI prompt optimizer lets you control the optimization process by specifying what to optimize (instructions only, demonstrations only, or both), providing a system instruction and prompt template, and selecting the target model.  You can optionally refine the optimization with some advanced settings like its duration and the number of optimization iterations it runs, which models the Vertex AI prompt optimizer uses, and other parameters to control the structure and content of prompts. Below you have some common and recommended default configurations.

In this scenario, you set two additional parameters:

* `custom_metric_name` parameter which lets you pass your own custom metric to optimize the prompt template.

* `custom_metric_cloud_function_name` parameter which indicates the Cloud function to call for collecting custom function evaluation metric output.

For more advanced control, you can learn and explore more about all the parameters and how to best use them in the [detailed documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/prompts/prompt-optimizer).


In [None]:
# NOTE(review): an earlier note here called out `response_mime_type="application/json"` as an
# important setting, but the configuration below passes "text/plain" — confirm which is intended.

# Job name with an id suffix from vapo_lib.get_id(), and the Cloud Storage folder
# that receives the results of this run.
PROMPT_OPTIMIZATION_JOB = "auto-prompt-design-job-" + vapo_lib.get_id()
OUTPUT_OPTIMIZATION_RUN_URI = str(
    OUTPUT_OPTIMIZATION_DATA_URI / PROMPT_OPTIMIZATION_JOB
)

# Prompt optimizer job configuration. A later cell serializes it to JSON and uploads it
# to CONFIG_FILE_URI.
# NOTE(review): `source_model_qps` is an empty string although its description says integer —
# presumably ignored because `source_model` is empty; confirm.
# NOTE(review): the model locations are hard-coded to "us-central1" rather than taken from
# REGION — confirm this is intentional when REGION is changed.
args = Namespace(
    # Basic configuration
    system_instruction=SYSTEM_INSTRUCTION_TEMPLATE,  # System instructions for the target model. String.
    prompt_template=PROMPT_TEMPLATE,  # Template for prompts,  String.
    target_model="gemini-2.5-flash",  # Target model for optimization. String. Supported models: "gemini-2.5-flash-lite", "gemini-2.5-flash", "gemini-2.5-pro", "gemini-2.0-flash-lite-001", "gemini-2.0-flash-001"
    thinking_budget=-1,  # Thinking budget for thinking models. -1 means no thinking for non-thinking models and auto thinking for thinking models. Integer.
    optimization_mode="instruction_and_demo",  # Optimization mode. String. Supported modes: "instruction", "demonstration", "instruction_and_demo"
    custom_metric_name="custom_engagement_personalization_score",  # Metric name, as defined by the key that corresponds in the dictionary returned from Cloud function. String.
    custom_metric_cloud_function_name="custom_metric_case_2_3_transcription",  # Cloud Run function name you previously deployed. String.
    eval_metrics_types=[
        "custom_metric",
    ],  # List of evaluation metrics. List of strings. Supported metrics: "bleu", "coherence", "comet", "exact_match", "fluency", "groundedness", "metricx", "rouge_1", "rouge_2", "rouge_l", "rouge_l_sum", "safety", "question_answering_correctness", "question_answering_quality", "summarization_quality", "text_quality", "verbosity", "tool_call_valid", "tool_name_match", "tool_parameter_key_match", "tool_parameter_kv_match"
    eval_metrics_weights=[
        1.0
    ],  # Weights for evaluation metrics. List of floats.  Length must match eval_metrics_types.  Should sum to 1.
    aggregation_type="weighted_sum",  # Aggregation type for evaluation metrics. String. Supported aggregation types: "weighted_sum", "weighted_average"
    input_data_path=INPUT_OPTIMIZATION_DATA_FILE_URI,  # Cloud Storage URI to input optimization data. String.
    output_path=OUTPUT_OPTIMIZATION_RUN_URI,  # Cloud Storage URI to save optimization results. String.
    project=PROJECT_ID,  # Google Cloud project ID. String.
    # (Optional) Advanced configuration
    num_steps=10,  # Number of iterations in instruction optimization mode. Integer between 10 and 20.
    num_demo_set_candidates=10,  # Number of demonstrations evaluated in instruction and instruction_and_demo mode. Integer between 10 and 30.
    demo_set_size=3,  # Number of demonstrations generated per prompt. Integer between 3 and 6.
    target_model_location="us-central1", # Location of the target model. String. Default us-central1.
    optimizer_model_location="us-central1", # Location of the optimizer model. String. Default us-central1.
    source_model="",  # Google model that the system instructions and prompts were previously used with. String. Not needed if you provide target column.
    source_model_location="",  # Location of the source model. String. Default us-central1. Not needed if you provide target column.
    target_model_qps=1,  # The queries per second (QPS) sent to the target model. Integer greater or equal than 1 depending on your quota.
    optimizer_model_qps=1,  # The queries per second (QPS) sent to the optimization model. Integer greater or equal than 1 depending on your quota.
    eval_qps=1,  # The queries per second (QPS) sent to the eval model. Integer greater or equal than 1 depending on your quota.
    source_model_qps="",  # The queries per second (QPS) sent to the source model. Integer greater or equal than 1 depending on your quota.
    response_mime_type="text/plain",  # MIME response type that the target model uses. String. Supported response: text/plain, text/x.enum, application/json.
    response_schema="",  # The Vertex AI's Controlled Generation response schema that the target model uses to generate answers. String.
    language="Korean",  # Language of the system instructions. String. Supported languages: "English", "French", "German", "Hebrew", "Hindi", "Italian", "Japanese", "Korean", "Portuguese", "Simplified Chinese", "Spanish", "Traditional Chinese"
    placeholder_to_content=json.loads(
        "{}"
    ),  # Placeholder to replace any parameter in the system instruction. Dict.
    data_limit=10,  # Amount of data used for validation. Integer between 5 and 100.
    translation_source_field_name="",  # Fill in with the corresponding field name of the source text in the data if translation metrics like Comet or MetricX are selected. Otherwise, leave it as empty.
)

#### Upload the Vertex AI prompt optimizer configuration to Cloud Storage

After you define the Vertex AI prompt optimizer configuration, you upload it to your Cloud Storage bucket.


In [None]:
# Convert the Namespace to a plain dict once. When this cell is re-run, `args` is
# already a dict, and calling vars() on a dict would raise a TypeError.
if isinstance(args, Namespace):
    args = vars(args)

# Upload the configuration as JSON; the `with` block closes the file on exit,
# so no explicit close() is needed.
with epath.Path(CONFIG_FILE_URI).open("w") as config_file:
    json.dump(args, config_file)

#### Run the automatic prompt optimization job

Now you are ready to run your first Vertex AI prompt optimizer job using the Vertex AI SDK for Python.

> This prompt optimization job requires ~ 40 minutes to run.

> Be sure you have provisioned enough queries per minute (QPM) quota, following the recommended QPM for each model. If you configure the Vertex AI prompt optimizer with a QPM that is higher than the QPM you have access to, the job might fail. [Check out](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/prompts/prompt-optimizer#before-you-begin) the documentation to know more.


In [None]:
# Single-replica worker pool that runs the optimizer container and points it at
# the uploaded configuration file.
WORKER_POOL_SPECS = [
    dict(
        machine_spec=dict(machine_type="n1-standard-4"),
        replica_count=1,
        container_spec=dict(
            image_uri=APD_CONTAINER_URI,
            args=["--config=" + CONFIG_FILE_URI],
        ),
    )
]

# Create the custom job and submit it under the tutorial's service account.
custom_job = aiplatform.CustomJob(
    worker_pool_specs=WORKER_POOL_SPECS,
    display_name=PROMPT_OPTIMIZATION_JOB,
)
custom_job.submit(service_account=SERVICE_ACCOUNT)

In [None]:
# Display the submitted job object.
custom_job

### Collect and display the optimization results

Vertex AI prompt optimizer returns both optimized templates and evaluation results for instructions, demonstrations, or both, depending on the optimization mode you define, as JSONL files in your Cloud Storage bucket. Those results help you understand the optimization process.

In this case, you want to collect the optimized templates and evaluation results for the system instruction.

Below you use a helper function to display those results.

In [None]:
# Show the Cloud Storage folder where this run's results are written.
print(OUTPUT_OPTIMIZATION_RUN_URI)

In [None]:
# Build the results viewer from the helper library for this run's output folder.
results_ui = vapo_lib.ResultsUI(OUTPUT_OPTIMIZATION_RUN_URI)
# HTML snippet shown above the viewer; it currently contains only blank lines.
results_df_html = """

"""

display(HTML(results_df_html))
display(results_ui.get_container())

## 4. Clean up

In [None]:
# Set a flag to True to delete the corresponding resource; nothing is deleted by default.
delete_bucket = False
delete_job = False
delete_run = False  # controls deletion of the deployed metric function
delete_tutorial = False

# Remove the bucket and everything stored in it.
if delete_bucket:
    ! gsutil rm -r {BUCKET_URI}

# Delete the prompt optimization custom job.
if delete_job:
    custom_job.delete()

# Delete the deployed metric function.
# NOTE(review): the command has no --quiet flag, so gcloud may ask for confirmation — confirm it completes in your notebook environment.
if delete_run:
    ! gcloud functions delete 'custom_metric_case_2_3_transcription' --region={REGION}

# Remove the local tutorial folder, including the build folder inside it.
if delete_tutorial:
    import shutil

    shutil.rmtree(str(TUTORIAL_PATH))