# Batch Prediction Pipeline Workaround for Reservations

Reference docs: https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-2.20.0/ 


This is a sample; some parameters may differ based on your implementation.

## Create a custom component that deploys a model to an endpoint with a GPU reservation

In [None]:
import kfp
from kfp import dsl
from kfp.dsl import (
    Input,
    Output,
)  # Common artifact types
from google_cloud_pipeline_components.types.artifact_types import (
    VertexEndpoint,
    VertexModel,
)


@dsl.component(
    packages_to_install=["google-cloud-aiplatform", "google-cloud-pipeline-components"]
)
def create_endpoint_with_reservation(
    endpoint: Input[VertexEndpoint],
    model: str,
    deployed_name: str,
    machine_type: str,
    accelerator_type: str,
    accelerator_count: int,
    reservation_zone: str,
    project_id: str,
    reservation_name: str,
    min_replica: int,
    max_replica: int,
    location: str,
    deployed_endpoint: Output[VertexEndpoint],
    deployed_model: Output[VertexModel],
) -> dict:
    """Deploy a model to an existing endpoint on a specific GPU reservation.

    Args:
        endpoint: Endpoint artifact to deploy onto.
        model: Model ID, or a full ``projects/.../models/...`` resource name.
        deployed_name: Display name given to the deployed model.
        machine_type: Machine type for the serving replicas.
        accelerator_type: Accelerator type attached to each replica.
        accelerator_count: Number of accelerators per replica.
        reservation_zone: Zone of the Compute Engine reservation.
        project_id: Project used to initialise the SDK, to build the model
            resource name (when ``model`` is a bare ID) and to build the
            reservation path.
        reservation_name: Name of the reservation to consume.
        min_replica: Minimum replica count for the deployment.
        max_replica: Maximum replica count for the deployment.
        location: Vertex AI region of the endpoint and model.
        deployed_endpoint: Output artifact pointing at the endpoint that now
            serves the model.
        deployed_model: Output artifact pointing at the deployed model.

    Returns:
        A JSON-serialisable dict with the ``deployed_endpoint`` and
        ``deployed_model`` resource names.
    """
    from google.cloud import aiplatform

    aiplatform.init(
        project=project_id,
        location=location,
    )

    # Prefer the resource name recorded on the artifact; fall back to parsing
    # it out of the REST URI ("https://<host>/v1/<resource name>").
    endpoint_fqn = endpoint.metadata.get("resourceName") or endpoint.uri.split(
        "v1/"
    )[1]
    # Accept either a bare model ID or an already fully-qualified name.
    if model.startswith("projects/"):
        model_fqn = model
    else:
        model_fqn = f"projects/{project_id}/locations/{location}/models/{model}"

    vertex_endpoint = aiplatform.Endpoint(endpoint_fqn)
    vertex_model = aiplatform.Model(model_name=model_fqn)

    # sync=True blocks until the deployment finishes, so the outputs below are
    # only published once the model is actually serving.
    vertex_endpoint.deploy(
        model=vertex_model,
        deployed_model_display_name=deployed_name,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        reservation_affinity_type="SPECIFIC_RESERVATION",
        reservation_affinity_key="compute.googleapis.com/reservation-name",
        reservation_affinity_values=[
            f"projects/{project_id}/zones/{reservation_zone}/reservations/{reservation_name}"
        ],
        min_replica_count=min_replica,
        max_replica_count=max_replica,
        sync=True,
    )

    # Output artifacts are handed to downstream tasks through the injected
    # Output[...] objects, not through the return value, so fill them in here.
    # NOTE(review): the name reported by the API is used for the model on the
    # assumption that downstream undeploy matches on it - confirm.
    model_resource_name = vertex_model.resource_name
    deployed_endpoint.uri = endpoint.uri
    deployed_endpoint.metadata["resourceName"] = endpoint_fqn
    deployed_model.uri = (
        f"https://{location}-aiplatform.googleapis.com/v1/{model_resource_name}"
    )
    deployed_model.metadata["resourceName"] = model_resource_name

    # The dict return is a pipeline output parameter and must be plain JSON.
    return {
        "deployed_endpoint": endpoint_fqn,
        "deployed_model": model_resource_name,
    }

In [None]:
from kfp import dsl
from google_cloud_pipeline_components.v1.model import ModelUploadOp
from google_cloud_pipeline_components.v1.endpoint import (
    EndpointCreateOp,
    EndpointDeleteOp,
    ModelUndeployOp,
)

# GCS bucket name for the (optional) pipeline root. Kept as a plain string so
# it interpolates cleanly into "gs://{bucket}/..." paths.
bucket = "model_experimentation_2025"


@dsl.pipeline(
    name="deploy-model-with-reserved-gpu",
    description="Deploys a model to an endpoint using a reserved GPU.",
    # pipeline_root=f"gs://{bucket}/pipeline_root",
)
def deploy_model_pipeline(
    project_id: str,
    model: str,
    region: str,
    shared_project_id: str,
    zone: str,
    reservation_name: str,
    endpoint_display_name: str,
    deployed_model_display_name: str,
    machine_type: str,
    accelerator_type: str,
    accelerator_count: int = 1,
    min_replica_count: int = 1,
    max_replica_count: int = 2,
):
    """Create an endpoint, deploy a model on a GPU reservation, then tear down.

    Args:
        project_id: Project in which the endpoint is created.
        model: Model identifier forwarded to the deployment component.
        region: Vertex AI region for the endpoint and deployment.
        shared_project_id: Project forwarded to the deployment component as
            its ``project_id`` (the project that owns the reservation).
        zone: Zone of the reservation.
        reservation_name: Name of the reservation to consume.
        endpoint_display_name: Display name of the new endpoint.
        deployed_model_display_name: Display name of the deployed model.
        machine_type: Machine type for the serving replicas.
        accelerator_type: Accelerator type attached to each replica.
        accelerator_count: Accelerators per replica.
        min_replica_count: Minimum number of serving replicas.
        max_replica_count: Maximum number of serving replicas.
    """
    # 1. Create an endpoint
    create_endpoint_task = EndpointCreateOp(
        project=project_id,
        location=region,
        display_name=endpoint_display_name,
    )

    # 2. Deploy the model to the endpoint with reserved GPU
    model_deploy_op = create_endpoint_with_reservation(
        endpoint=create_endpoint_task.outputs["endpoint"],
        model=model,
        location=region,
        deployed_name=deployed_model_display_name,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        reservation_zone=zone,
        project_id=shared_project_id,
        reservation_name=reservation_name,
        min_replica=min_replica_count,
        max_replica=max_replica_count,
    )
    # 3. TODO: Implement prediction job here via custom component

    # 4. Teardown of resources post-prediction
    model_undeploy_op = ModelUndeployOp(
        endpoint=model_deploy_op.outputs["deployed_endpoint"],
        model=model_deploy_op.outputs["deployed_model"],
        # traffic_split={"0": 100} # Optional: to ensure all traffic is removed from this model_id
        # If this is the only model, it will be removed.
    )

    delete_endpoint_op = EndpointDeleteOp(
        endpoint=model_deploy_op.outputs[
            "deployed_endpoint"
        ],  # Use the same endpoint from deploy op
    )
    # The delete step takes no output from the undeploy step, so the ordering
    # has to be declared explicitly.
    delete_endpoint_op.after(model_undeploy_op)

## Compile the pipeline

In [11]:
# Compile the pipeline function into the spec file that the submission cell
# loads via `template_path`.
pipeline_compiler = kfp.compiler.Compiler()
pipeline_compiler.compile(
    package_path="predict_w_reservations.json",
    pipeline_func=deploy_model_pipeline,
)

In [12]:
# Runtime arguments for the pipeline; the keys match the parameter names of
# `deploy_model_pipeline`. Replace with your project ID, region, etc.
pipeline_params = {
    "project_id": "wortz-project-352116",
    "model": "3416616934593003520",
    "region": "us-central1",
    "shared_project_id": "wortz-project-352116",
    "zone": "us-central1-b",
    "reservation_name": "a100-custom-image-reservation",
    "endpoint_display_name": "Reservation_Endpoint",
    "deployed_model_display_name": "My_deployed_model",
    "accelerator_type": "NVIDIA_TESLA_A100",
    "machine_type": "a2-highgpu-1g",
}

In [None]:
from google.cloud import aiplatform

# Point the SDK at the project and region the pipeline will run in.
aiplatform.init(
    project=pipeline_params["project_id"],
    location=pipeline_params["region"],
)
job = aiplatform.PipelineJob(
    display_name="Predictions with GPU Reservations",
    template_path="predict_w_reservations.json",
    # pipeline_root=f"gs://{bucket}/pipeline_runs",
    parameter_values=pipeline_params,
    project=pipeline_params["project_id"],
    location=pipeline_params["region"],
    # Every step of this pipeline is a side effect (create / deploy /
    # undeploy / delete). A cache hit would skip the step and reuse the
    # outputs of an earlier run, so caching is turned off.
    enable_caching=False,
)

# submit() returns immediately; the run continues in Vertex AI Pipelines
# under the given service account.
job.submit(service_account="vertex-sa@wortz-project-352116.iam.gserviceaccount.com")

Creating PipelineJob
PipelineJob created. Resource name: projects/679926387543/locations/us-central1/pipelineJobs/deploy-model-with-reserved-gpu-20250522183049
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/679926387543/locations/us-central1/pipelineJobs/deploy-model-with-reserved-gpu-20250522183049')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/deploy-model-with-reserved-gpu-20250522183049?project=679926387543


In [None]:
import json

# Build a one-example prediction request from the test split and save it.
# `datasets` and `scale` are not defined in this notebook section and must
# already exist in the session.
one_sample = datasets["test"].map(scale).take(1)
list_data = list(one_sample.as_numpy_iterator())
# NOTE(review): assumes each element is a (features, label) pair, so [0][0]
# selects the features of the first element - confirm against the dataset.
instances = {"instances": list_data[0][0].tolist()}
# Kept for any later cell that wants the request body as a string.
json_str = json.dumps(instances)
with open("data.json", "w", encoding="utf-8") as f:
    # Dump the dict itself: dumping `json_str` would JSON-encode an already
    # encoded string and leave a quoted, escaped string in the file instead
    # of a JSON object.
    json.dump(instances, f, ensure_ascii=False, indent=4)