# Batch Prediction Pipeline Workaround for Reservations

Reference docs: https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-2.20.0/ 


This is a sample; some parameters may differ based on your implementation.

## Create a custom component that deploys a model to an endpoint with a GPU reservation

In [None]:
import kfp
from kfp import dsl
from kfp.dsl import (
    Input,
    Output,
)  # Common artifact types
from google_cloud_pipeline_components.types.artifact_types import (
    VertexEndpoint,
    VertexModel,
)


@dsl.component(
    base_image="python:3.11",
    packages_to_install=["google-cloud-aiplatform", "google-cloud-pipeline-components"]
)
def create_endpoint_with_reservation(
    endpoint: Input[VertexEndpoint],
    model: str,
    deployed_name: str,
    machine_type: str,
    accelerator_type: str,
    accelerator_count: int,
    reservation_zone: str,
    project_id: str,
    reservation_name: str,
    min_replica: int,
    max_replica: int,
    location: str,
    deployed_endpoint: Output[VertexEndpoint],
    deployed_model: Output[VertexModel],
) -> dict:
    """Deploy a model to an existing endpoint using a specific GPU reservation.

    Args:
        endpoint: Endpoint artifact to deploy to (e.g. from EndpointCreateOp).
        model: Model ID, or a full ``projects/.../models/...`` resource name.
        deployed_name: Display name for the deployed model.
        machine_type: Machine type for the serving nodes.
        accelerator_type: Accelerator type attached to each node.
        accelerator_count: Number of accelerators per node.
        reservation_zone: Zone that holds the reservation.
        project_id: Project used to initialise the SDK and to build both the
            model resource name and the reservation path.
        reservation_name: Name of the Compute Engine reservation.
        min_replica: Minimum replica count.
        max_replica: Maximum replica count.
        location: Vertex AI region.
        deployed_endpoint: Output artifact populated with the endpoint that
            now serves the model.
        deployed_model: Output artifact populated with the deployed model.

    Returns:
        A JSON-serialisable dict with the resource names of the endpoint and
        the model under the keys ``deployed_endpoint`` and ``deployed_model``.
    """
    from google.cloud import aiplatform

    aiplatform.init(
        project=project_id,
        location=location,
    )

    # Prefer the resource name stored in the artifact metadata; fall back to
    # parsing it out of the artifact URI (".../v1/projects/...").
    # NOTE(review): assumes the upstream op sets metadata["resourceName"] — the
    # URI fallback keeps the previous behaviour if it does not.
    endpoint_fqn = (endpoint.metadata or {}).get("resourceName")
    if not endpoint_fqn:
        _, found, tail = endpoint.uri.partition("/v1/")
        endpoint_fqn = tail if found else endpoint.uri

    if model.startswith("projects/"):
        model_fqn = model
    else:
        model_fqn = f"projects/{project_id}/locations/{location}/models/{model}"

    vertex_endpoint = aiplatform.Endpoint(endpoint_fqn)
    vertex_model = aiplatform.Model(model_name=model_fqn)

    vertex_endpoint.deploy(
        model=vertex_model,
        deployed_model_display_name=deployed_name,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        reservation_affinity_type="SPECIFIC_RESERVATION",
        reservation_affinity_key="compute.googleapis.com/reservation-name",
        reservation_affinity_values=[
            f"projects/{project_id}/zones/{reservation_zone}/reservations/{reservation_name}"
        ],
        min_replica_count=min_replica,
        max_replica_count=max_replica,
        sync=True,
    )

    # Output artifacts are passed downstream by what is written onto the
    # injected objects, not by the function's return value, so fill them in
    # explicitly. Use the resource names resolved by the SDK.
    api_root = f"https://{location}-aiplatform.googleapis.com/v1/"
    endpoint_resource = vertex_endpoint.resource_name
    model_resource = vertex_model.resource_name

    deployed_endpoint.uri = endpoint.uri or f"{api_root}{endpoint_resource}"
    deployed_endpoint.metadata["resourceName"] = endpoint_resource
    deployed_model.uri = f"{api_root}{model_resource}"
    deployed_model.metadata["resourceName"] = model_resource

    # Return plain strings: artifact objects cannot be serialised as a dict
    # output parameter.
    return {
        "deployed_endpoint": endpoint_resource,
        "deployed_model": model_resource,
    }


In [18]:
#### Bind SA to the reservation
# Grants roles/compute.viewer on the reservation to the service account
# service-<PROJECT_NUMBER>@gcp-sa-aiplatform.iam.gserviceaccount.com.
# `$PROJECT_NUMBER` in the shell command is filled in from the Python variable below.
# NOTE(review): the operation status at the end of this notebook still reports
# "Permission denied getting reservation" — confirm whether this binding alone is enough.
PROJECT_NUMBER = 679926387543

! gcloud compute reservations add-iam-policy-binding \
    a100-custom-image-reservation \
    --zone=us-central1-b \
    --member="serviceAccount:service-$PROJECT_NUMBER@gcp-sa-aiplatform.iam.gserviceaccount.com" \
    --role="roles/compute.viewer" \
    --project=wortz-project-352116

Updated IAM policy for reservation [a100-custom-image-reservation].
bindings:
- members:
  - serviceAccount:vertex-sa@wortz-project-352116.iam.gserviceaccount.com
  role: roles/compute.admin
- members:
  - serviceAccount:vertex-sa@wortz-project-352116.iam.gserviceaccount.com
  role: roles/compute.futureReservationAdmin
- members:
  - serviceAccount:vertex-sa@wortz-project-352116.iam.gserviceaccount.com
  role: roles/compute.instanceAdmin
- members:
  - serviceAccount:679926387543-compute@developer.gserviceaccount.com
  - serviceAccount:service-679926387543@gcp-sa-aiplatform.iam.gserviceaccount.com
  - serviceAccount:vertex-sa@wortz-project-352116.iam.gserviceaccount.com
  role: roles/compute.viewer
etag: BwY2nVJOxlc=
version: 1


In [23]:
from kfp import dsl
from google_cloud_pipeline_components.v1.model import ModelUploadOp
from google_cloud_pipeline_components.v1.endpoint import (
    EndpointCreateOp,
    EndpointDeleteOp,
    ModelUndeployOp,
)

# Bucket name as a plain string (a trailing comma would make this a 1-tuple and
# corrupt any f"gs://{bucket}/..." path built from it).
bucket = "model_experimentation_2025"


@dsl.pipeline(
    name="deploy-model-with-reserved-gpu",
    description="Deploys a model to an endpoint using a reserved GPU.",
    # pipeline_root=f"gs://{bucket}/pipeline_root",
)
def deploy_model_pipeline(
    project_id: str,
    model: str,
    region: str,
    shared_project_id: str,
    zone: str,
    reservation_name: str,
    endpoint_display_name: str,
    deployed_model_display_name: str,
    machine_type: str,
    accelerator_type: str,
    accelerator_count: int = 1,
    min_replica: int = 1,
    max_replica: int = 2,
):
    """Create an endpoint, deploy a model on a reserved GPU, then tear down.

    Args:
        project_id: Project in which the endpoint is created.
        model: Model identifier passed to the deployment component.
        region: Vertex AI region for the endpoint and the deployment.
        shared_project_id: Project passed to the deployment component as its
            ``project_id``.
        zone: Zone of the reservation.
        reservation_name: Name of the reservation to consume.
        endpoint_display_name: Display name for the new endpoint.
        deployed_model_display_name: Display name for the deployed model.
        machine_type: Machine type for the serving nodes.
        accelerator_type: Accelerator type attached to each node.
        accelerator_count: Accelerators per node (defaults to 1).
        min_replica: Minimum replica count (defaults to 1).
        max_replica: Maximum replica count (defaults to 2).
    """

    # 1. Create an endpoint
    create_endpoint_task = EndpointCreateOp(
        project=project_id,
        location=region,
        display_name=endpoint_display_name,
    )

    # 2. Deploy the model to the endpoint with reserved GPU
    model_deploy_op = create_endpoint_with_reservation(
        endpoint=create_endpoint_task.outputs["endpoint"],
        model=model,
        location=region,
        deployed_name=deployed_model_display_name,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        reservation_zone=zone,
        project_id=shared_project_id,
        reservation_name=reservation_name,
        min_replica=min_replica,
        max_replica=max_replica,
    )
    # 3. TODO: Implement prediction job here via custom component

    # 4. Teardown of resources post-prediction
    model_undeploy_op = ModelUndeployOp(
        endpoint=model_deploy_op.outputs["deployed_endpoint"],
        model=model_deploy_op.outputs["deployed_model"],
        # traffic_split={"0": 100} # Optional: to ensure all traffic is removed from this model_id
        # If this is the only model, it will be removed.
    )

    delete_endpoint_op = EndpointDeleteOp(
        endpoint=model_deploy_op.outputs[
            "deployed_endpoint"
        ],  # Use the same endpoint from deploy op
    )
    # The endpoint can only be deleted once the model has been undeployed.
    delete_endpoint_op.after(model_undeploy_op)

## Compile the pipeline

In [24]:
# Compile the pipeline function into a local pipeline definition file that the
# PipelineJob below loads via `template_path`.
pipeline_compiler = kfp.compiler.Compiler()
pipeline_compiler.compile(
    package_path="predict_w_reservations.json",
    pipeline_func=deploy_model_pipeline,
)

In [25]:
# Replace with your project ID, region, etc.
pipeline_params = dict(
    project_id="wortz-project-352116",
    model="3416616934593003520",
    region="us-central1",
    shared_project_id="wortz-project-352116",
    zone="us-central1-b",
    reservation_name="a100-custom-image-reservation",
    endpoint_display_name="Reservation_Endpoint",
    deployed_model_display_name="My_deployed_model",
    accelerator_type="NVIDIA_TESLA_A100",
    machine_type="a2-highgpu-1g",
)

In [None]:
from google import auth


# Load the environment's default Google credentials, requesting the
# cloud-platform scope; also yields the project associated with them.
credentials, project = auth.default(
    scopes=["https://www.googleapis.com/auth/cloud-platform"]
)

In [50]:
# Capture the output of `gcloud auth print-access-token` (IPython shell capture);
# the token is read from the first element, CREDS[0], in the next cell.
CREDS = ! gcloud auth print-access-token

In [None]:
import google.oauth2.credentials

# Wrap the gcloud access token in an OAuth2 credentials object. This rebinds
# `credentials`, replacing the value obtained from auth.default() earlier.
credentials = google.oauth2.credentials.Credentials(CREDS[0])

In [None]:
from google.cloud import aiplatform

# Initialise the SDK for the target project/region and run as the given
# service account.
aiplatform.init(
    project=pipeline_params["project_id"],
    location=pipeline_params["region"],
    # credentials=credentials,
    service_account="vertex-sa@wortz-project-352116.iam.gserviceaccount.com",
)
job = aiplatform.PipelineJob(
    display_name="Predictions with GPU Reservations",
    template_path="predict_w_reservations.json",
    # pipeline_root=f"gs://{bucket}/pipeline_runs",
    parameter_values=pipeline_params,
    project=pipeline_params["project_id"],
    location=pipeline_params["region"],
    # Every step creates or destroys a cloud resource, so a cached result from
    # an earlier run (e.g. an endpoint that has since been deleted) must not
    # be reused.
    enable_caching=False,
    # credentials=credentials,
)

job.submit()

Creating PipelineJob
PipelineJob created. Resource name: projects/679926387543/locations/us-central1/pipelineJobs/deploy-model-with-reserved-gpu-20250602162032
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/679926387543/locations/us-central1/pipelineJobs/deploy-model-with-reserved-gpu-20250602162032')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/deploy-model-with-reserved-gpu-20250602162032?project=679926387543


In [None]:
#### WIP

In [1]:
# import json

# one_sample = datasets["test"].map(scale).take(1)
# list_data = list(one_sample.as_numpy_iterator())
# instances = {"instances": list_data[0][0].tolist()}
# json_str = json.dumps(instances)
# with open("data.json", "w", encoding="utf-8") as f:
#     json.dump(json_str, f, ensure_ascii=False, indent=4)

# New Local Testing

In [14]:
def create_endpoint_with_reservation_local(
    endpoint_uri: str,
    model: str,
    deployed_name: str,
    machine_type: str,
    accelerator_type: str,
    accelerator_count: int,
    reservation_zone: str,
    project_id: str,
    reservation_name: str,
    min_replica: int,
    max_replica: int,
    location: str,
    SA: str,
) -> dict:
    """Local (non-pipeline) variant: deploy a model on a specific reservation.

    Args:
        endpoint_uri: Endpoint URL of the form ``https://<host>/v1/projects/...``;
            a bare resource name without the ``/v1/`` prefix is used as-is.
        model: Model ID, or a full ``projects/.../models/...`` resource name.
        deployed_name: Display name for the deployed model.
        machine_type: Machine type for the serving nodes.
        accelerator_type: Accelerator type attached to each node.
        accelerator_count: Number of accelerators per node.
        reservation_zone: Zone that holds the reservation.
        project_id: Project used for the SDK, the model and the reservation path.
        reservation_name: Name of the Compute Engine reservation.
        min_replica: Minimum replica count.
        max_replica: Maximum replica count.
        location: Vertex AI region.
        SA: Service account e-mail used for the SDK and the deployment.

    Returns:
        Dict with ``deployed_endpoint`` (the ``aiplatform.Endpoint``) and
        ``deployed_model`` (a ``VertexModel`` artifact describing the model).
    """
    from google_cloud_pipeline_components.types.artifact_types import (
        VertexModel,
    )
    from google.cloud import aiplatform

    aiplatform.init(
        project=project_id,
        location=location,
        service_account=SA,
    )

    # Strip everything up to and including "/v1/" to get the resource name.
    _, found, tail = endpoint_uri.partition("/v1/")
    endpoint_fqn = tail if found else endpoint_uri

    if model.startswith("projects/"):
        model_fqn = model
    else:
        model_fqn = f"projects/{project_id}/locations/{location}/models/{model}"

    vertex_endpoint = aiplatform.Endpoint(endpoint_fqn)
    vertex_model = aiplatform.Model(model_name=model_fqn)

    vertex_endpoint.deploy(
        model=vertex_model,
        deployed_model_display_name=deployed_name,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        reservation_affinity_type="SPECIFIC_RESERVATION",
        reservation_affinity_key="compute.googleapis.com/reservation-name",
        reservation_affinity_values=[
            f"projects/{project_id}/zones/{reservation_zone}/reservations/{reservation_name}"
        ],
        min_replica_count=min_replica,
        max_replica_count=max_replica,
        sync=True,
        service_account=SA,
    )

    # The artifact class has no `model_resource_name` constructor argument;
    # build it through its `create` factory with the full resource name.
    model_resource = vertex_model.resource_name
    return {
        "deployed_endpoint": vertex_endpoint,
        "deployed_model": VertexModel.create(
            name="deployed_model",
            uri=f"https://{location}-aiplatform.googleapis.com/v1/{model_resource}",
            model_resource_name=model_resource,
        ),
    }

In [16]:
# Existing endpoint to deploy onto (full REST URL, including the /v1/ prefix).
endpoint_uri = "https://us-central1-aiplatform.googleapis.com/v1/projects/679926387543/locations/us-central1/endpoints/2591835879202881536"

# Run the local deployment helper with the same settings as the pipeline.
create_endpoint_with_reservation_local(
    endpoint_uri=endpoint_uri,
    model=pipeline_params["model"],
    deployed_name=pipeline_params["deployed_model_display_name"],
    machine_type=pipeline_params["machine_type"],
    accelerator_type=pipeline_params["accelerator_type"],
    accelerator_count=1,
    reservation_zone=pipeline_params["zone"],
    project_id=pipeline_params["project_id"],
    reservation_name=pipeline_params["reservation_name"],
    min_replica=1,
    max_replica=2,
    location=pipeline_params["region"],
    SA="vertex-sa@wortz-project-352116.iam.gserviceaccount.com",
)

Deploying Model projects/679926387543/locations/us-central1/models/3416616934593003520 to Endpoint : projects/679926387543/locations/us-central1/endpoints/2591835879202881536
Deploy Endpoint model backing LRO: projects/679926387543/locations/us-central1/endpoints/2591835879202881536/operations/6421892653540966400


KeyboardInterrupt: 

### Try the REST method
https://cloud.google.com/vertex-ai/docs/predictions/use-reservations#deploy_model_reservation-drest

In [6]:
### Try the rest method 
# https://cloud.google.com/vertex-ai/docs/predictions/use-reservations#deploy_model_reservation-drest

import requests
import json


def call_post_api(api_url: str, payload: dict, token: "str | None", timeout: float = 10):
    """
    Calls a POST REST API with the given URL and JSON payload.

    Args:
        api_url (str): The URL of the API endpoint.
        payload (dict): A dictionary representing the JSON payload to send.
        token (str or None): Bearer access token for the call; when None no
            Authorization header is sent.
        timeout (float): Request timeout in seconds (defaults to 10).

    Returns:
        dict or None: The JSON response from the API as a dictionary if successful,
                      or None if an error occurred (the error is printed).
    """
    headers = {"Content-Type": "application/json"}
    if token is not None:
        headers["Authorization"] = f"Bearer {token}"

    # Serialise up front so a non-serialisable payload fails loudly here.
    json_payload = json.dumps(payload)

    try:
        response = requests.post(
            api_url, data=json_payload, headers=headers, timeout=timeout
        )
    except requests.exceptions.ConnectionError as conn_err:
        print(f"Connection error occurred: {conn_err}")
        return None
    except requests.exceptions.Timeout as timeout_err:
        print(f"Timeout error occurred: {timeout_err}")
        return None
    except requests.exceptions.RequestException as req_err:
        print(f"An unexpected error occurred with the request: {req_err}")
        return None

    try:
        # Raise an exception for bad status codes (4xx or 5xx)
        response.raise_for_status()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        print(f"Response content: {response.content}")
        return None

    try:
        return response.json()
    except ValueError:
        # Handled separately from the request errors: the decode error raised
        # by requests is also a RequestException, so a shared try block would
        # report it as a generic request failure. ValueError covers both the
        # stdlib and the requests decode errors.
        print("Error decoding JSON response. Response was not valid JSON.")
        print(f"Response content: {response.text}")
        return None

In [7]:
def deploy_reserved_model_rest_api(
    token: str,
    endpoint_id="2591835879202881536",
    model=pipeline_params["model"],
    deployed_name=pipeline_params["deployed_model_display_name"],
    machine_type=pipeline_params["machine_type"],
    accelerator_type=pipeline_params["accelerator_type"],
    accelerator_count=1,
    reservation_zone=pipeline_params["zone"],
    project_id=pipeline_params["project_id"],
    reservation_name=pipeline_params["reservation_name"],
    min_replica=1,
    max_replica=2,
    location=pipeline_params["region"],
):
    """Send a ``deployModel`` REST request that targets a specific reservation.

    Builds the endpoint URL and request body from the arguments, prints both,
    and posts them with ``call_post_api`` using ``token`` as the bearer token.
    Defaults are taken from ``pipeline_params`` when the function is defined.

    Returns:
        Whatever ``call_post_api`` returns for the request.
    """
    project_path = f"projects/{project_id}"
    location_path = f"{project_path}/locations/{location}"

    # Reservation the serving nodes must be placed on.
    reservation_affinity = {
        "reservationAffinityType": "SPECIFIC_RESERVATION",
        "key": "compute.googleapis.com/reservation-name",
        "values": [
            f"{project_path}/zones/{reservation_zone}/reservations/{reservation_name}"
        ],
    }
    machine_spec = {
        "machineType": machine_type,
        "acceleratorType": accelerator_type,
        "acceleratorCount": accelerator_count,
        "reservationAffinity": reservation_affinity,
    }
    dedicated_resources = {
        "machineSpec": machine_spec,
        "minReplicaCount": min_replica,
        "maxReplicaCount": max_replica,
    }
    request_body = {
        "deployedModel": {
            "model": f"{location_path}/models/{model}",
            "displayName": deployed_name,
            "dedicatedResources": dedicated_resources,
        },
        "trafficSplit": {"0": 100},
    }

    url = (
        f"https://{location}-aiplatform.googleapis.com/v1/"
        f"{location_path}/endpoints/{endpoint_id}:deployModel"
    )
    print(url, '\n',  json.dumps(request_body))
    return call_post_api(api_url=url, payload=request_body, token=token)

In [8]:
# Capture a gcloud access token (IPython shell capture) and pass its first
# output line as the bearer token for the REST deployment request.
token = ! gcloud auth print-access-token
deploy_reserved_model_rest_api(token[0])

https://us-central1-aiplatform.googleapis.com/v1/projects/wortz-project-352116/locations/us-central1/endpoints/2591835879202881536:deployModel 
 {"deployedModel": {"model": "projects/wortz-project-352116/locations/us-central1/models/3416616934593003520", "displayName": "My_deployed_model", "dedicatedResources": {"machineSpec": {"machineType": "a2-highgpu-1g", "acceleratorType": "NVIDIA_TESLA_A100", "acceleratorCount": 1, "reservationAffinity": {"reservationAffinityType": "SPECIFIC_RESERVATION", "key": "compute.googleapis.com/reservation-name", "values": ["projects/wortz-project-352116/zones/us-central1-b/reservations/a100-custom-image-reservation"]}}, "minReplicaCount": 1, "maxReplicaCount": 2}}, "trafficSplit": {"0": 100}}


{'name': 'projects/679926387543/locations/us-central1/endpoints/2591835879202881536/operations/3729303026326831104',
 'metadata': {'@type': 'type.googleapis.com/google.cloud.aiplatform.v1.DeployModelOperationMetadata',
  'genericMetadata': {'createTime': '2025-06-02T20:10:47.248469Z',
   'updateTime': '2025-06-02T20:10:47.248469Z'}}}

#### Validate the operation from the REST call

In [9]:
### Get the operation status
# OP_ID is the trailing ID of the operation 'name' returned by the deployModel
# call above; `$OP_ID` in the curl URL is filled in from this Python variable.
OP_ID = '3729303026326831104'
! curl -X GET \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
    "https://us-central1-aiplatform.googleapis.com/v1/projects/wortz-project-352116/locations/us-central1/operations/$OP_ID"

{
  "name": "projects/679926387543/locations/us-central1/endpoints/2591835879202881536/operations/3729303026326831104",
  "metadata": {
    "@type": "type.googleapis.com/google.cloud.aiplatform.v1.DeployModelOperationMetadata",
    "genericMetadata": {
      "createTime": "2025-06-02T20:10:47.248469Z",
      "updateTime": "2025-06-02T20:10:48.653532Z"
    }
  },
  "done": true,
  "error": {
    "code": 7,
    "message": "Permission denied getting reservation [projects/wortz-project-352116/zones/us-central1-b/reservations/a100-custom-image-reservation]"
  }
}
