# Batch Prediction Pipeline Workaround for Reservations

Reference docs: https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-2.20.0/ 


This is a sample; some parameters may differ based on your implementation.

## Create a custom component that deploys a model to an endpoint with a GPU reservation

In [9]:
import kfp
from kfp import dsl
from kfp.dsl import (
    Input,
    Output,
)  # Common artifact types
from google_cloud_pipeline_components.types.artifact_types import (
    VertexEndpoint,
    VertexModel,
)


@dsl.component(
    base_image="python:3.11",
    packages_to_install=["google-cloud-aiplatform", "google-cloud-pipeline-components"],
)
def create_endpoint_with_reservation(
    endpoint: Input[VertexEndpoint],
    model: str,
    deployed_name: str,
    machine_type: str,
    accelerator_type: str,
    accelerator_count: int,
    reservation_zone: str,
    project_id: str,
    reservation_name: str,
    min_replica: int,
    max_replica: int,
    location: str,
    deployed_endpoint: Output[VertexEndpoint],
    deployed_model: Output[VertexModel],
    endpoint_id: dsl.OutputPath(str),
) -> None:
    """Deploy a model to an existing endpoint on a specific GPU reservation.

    Args:
        endpoint: Endpoint artifact to deploy to. Its ``uri`` must contain
            ``v1/`` followed by the endpoint resource name.
        model: Model ID; combined with ``project_id`` and ``location`` to
            build the model resource name.
        deployed_name: Display name given to the deployed model.
        machine_type: Machine type passed to ``Endpoint.deploy``.
        accelerator_type: Accelerator type passed to ``Endpoint.deploy``.
        accelerator_count: Number of accelerators per replica.
        reservation_zone: Zone of the Compute Engine reservation.
        project_id: Project used for ``aiplatform.init``, for the model
            resource name, and for the reservation path.
        reservation_name: Name of the reservation to consume.
        min_replica: Minimum replica count for the deployment.
        max_replica: Maximum replica count for the deployment.
        location: Region used for ``aiplatform.init``, the model resource
            name, and the output model URI.
        deployed_endpoint: Output artifact describing the endpoint that now
            hosts the model.
        deployed_model: Output artifact describing the deployed model.
        endpoint_id: File path supplied by KFP; the last path segment of the
            endpoint URI is written there so it is exposed as the
            ``endpoint_id`` string output.
    """
    from google.cloud import aiplatform

    aiplatform.init(
        project=project_id,
        location=location,
    )

    # The artifact URI embeds the resource name after the API version prefix.
    endpoint_fqn = endpoint.uri.split("v1/")[1]
    model_fqn = f"projects/{project_id}/locations/{location}/models/{model}"
    vertex_endpoint = aiplatform.Endpoint(endpoint_fqn)
    vertex_model = aiplatform.Model(model_name=model_fqn)

    vertex_endpoint.deploy(
        model=vertex_model,
        deployed_model_display_name=deployed_name,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        reservation_affinity_type="SPECIFIC_RESERVATION",
        reservation_affinity_key="compute.googleapis.com/reservation-name",
        reservation_affinity_values=[
            f"projects/{project_id}/zones/{reservation_zone}/reservations/{reservation_name}"
        ],
        min_replica_count=min_replica,
        max_replica_count=max_replica,
        # Block until the deployment finishes so downstream steps see a live model.
        sync=True,
    )

    # Output artifacts consumed by the downstream undeploy/delete steps.
    deployed_endpoint.uri = endpoint.uri
    deployed_model.uri = f"https://{location}-aiplatform.googleapis.com/v1/{model_fqn}"
    # NOTE(review): the GCPC endpoint ops are understood to read the resource
    # name from artifact metadata rather than from the URI -- confirm against
    # the pinned google-cloud-pipeline-components version.
    deployed_endpoint.metadata["resourceName"] = endpoint_fqn
    deployed_model.metadata["resourceName"] = model_fqn

    # Rebinding the parameter name would only change a local variable; the
    # value has to be written to the path KFP provides to become an output.
    with open(endpoint_id, "w", encoding="utf-8") as endpoint_id_file:
        endpoint_id_file.write(endpoint.uri.split("/")[-1])

## Build another custom component that does the batch prediction from a GCS location
Note: this includes some data manipulation specific to this model and may be different for other implementations.

```bash
curl -L -o ~/Downloads/cifar10-python-in-csv.zip\
  https://www.kaggle.com/api/v1/datasets/download/fedesoriano/cifar10-python-in-csv
```

Unzip the file then upload the test file to the bucket.

```bash
gsutil cp test.csv gs://model_experimentation_2025/prediction_data/test.csv
```

#### Next, upload the Spark batch predict script that leverages the custom endpoint to storage

In [10]:
# Cloud Storage destination for the Spark batch-predict script. This is the
# same URI the pipeline passes as main_python_file_uri to the Dataproc step.
script_location = 'gs://model_experimentation_2025/scripts/spark_batch_predict.py'
# Shell magic: copy the local spark_batch_predict.py to that destination
# ($script_location is filled in from the Python variable above).
! gsutil cp spark_batch_predict.py $script_location

Copying file://spark_batch_predict.py [Content-Type=text/x-python]...
/ [1 files][  2.8 KiB/  2.8 KiB]                                                
Operation completed over 1 objects/2.8 KiB.                                      


## Important - make sure you enable `roles/compute.viewer` permissions for your Vertex Service Account

In [31]:
#### Bind SA to the reservation
# Numeric project number used to build the Vertex AI service agent address
# (service-<PROJECT_NUMBER>@gcp-sa-aiplatform.iam.gserviceaccount.com).
PROJECT_NUMBER = 679926387543

# Shell magic: grant roles/compute.viewer on the reservation
# a100-custom-image-reservation (zone us-central1-b, project
# wortz-project-352116) to that service agent.
! gcloud compute reservations add-iam-policy-binding \
    a100-custom-image-reservation \
    --zone=us-central1-b \
    --member="serviceAccount:service-$PROJECT_NUMBER@gcp-sa-aiplatform.iam.gserviceaccount.com" \
    --role="roles/compute.viewer" \
    --project=wortz-project-352116

Updated IAM policy for reservation [a100-custom-image-reservation].
bindings:
- members:
  - serviceAccount:vertex-sa@wortz-project-352116.iam.gserviceaccount.com
  role: roles/compute.admin
- members:
  - serviceAccount:vertex-sa@wortz-project-352116.iam.gserviceaccount.com
  role: roles/compute.futureReservationAdmin
- members:
  - serviceAccount:vertex-sa@wortz-project-352116.iam.gserviceaccount.com
  role: roles/compute.instanceAdmin
- members:
  - serviceAccount:679926387543-compute@developer.gserviceaccount.com
  - serviceAccount:service-679926387543@gcp-sa-aiplatform.iam.gserviceaccount.com
  - serviceAccount:vertex-sa@wortz-project-352116.iam.gserviceaccount.com
  role: roles/compute.viewer
etag: BwY2nZYQIBE=
version: 1


## Pipeline that integrates standard components with the custom reservation deploy and batch predict steps

In [11]:
from kfp import dsl
from google_cloud_pipeline_components.v1.endpoint import (
    EndpointCreateOp,
    EndpointDeleteOp,
    ModelUndeployOp,
)
from google_cloud_pipeline_components.v1.dataproc import DataprocPySparkBatchOp

# Bucket name as a plain string (a trailing comma here would make it a 1-tuple).
bucket = "model_experimentation_2025"


@dsl.pipeline(
    name="deploy-model-with-reserved-gpu",
    description="Deploys a model to an endpoint using a reserved GPU.",
)
def deploy_model_pipeline(
    project_id: str,
    model: str,
    region: str,
    shared_project_id: str,
    zone: str,
    reservation_name: str,
    endpoint_display_name: str,
    deployed_model_display_name: str,
    machine_type: str,
    accelerator_type: str,
    bucket: str,
    prediction_input_blob: str,
    prediction_output_blob: str,
):
    """Create an endpoint, deploy on a reserved GPU, batch predict, tear down.

    Args:
        project_id: Project for the endpoint and the Dataproc batch job.
        model: Model ID forwarded to the deploy component.
        region: Region for the endpoint, the deployment, and the batch job.
        shared_project_id: Project passed to the deploy component as its
            ``project_id`` (used there for the model and reservation paths).
        zone: Zone of the reservation.
        reservation_name: Name of the reservation to consume.
        endpoint_display_name: Display name of the created endpoint.
        deployed_model_display_name: Display name of the deployed model.
        machine_type: Machine type for the deployment.
        accelerator_type: Accelerator type for the deployment.
        bucket: Bucket name forwarded to the Spark script.
        prediction_input_blob: Input object path forwarded to the Spark script.
        prediction_output_blob: Output object path forwarded to the Spark script.
    """

    # 1. Create an endpoint
    create_endpoint_task = EndpointCreateOp(
        project=project_id,
        location=region,
        display_name=endpoint_display_name,
    )

    # 2. Deploy the model to the endpoint with reserved GPU
    model_deploy_op = create_endpoint_with_reservation(
        endpoint=create_endpoint_task.outputs["endpoint"],
        model=model,
        location=region,
        deployed_name=deployed_model_display_name,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=1,
        reservation_zone=zone,
        project_id=shared_project_id,
        reservation_name=reservation_name,
        min_replica=1,
        max_replica=2,
    )

    # 3. Dataproc spark-based batch prediction job here.
    # Project and location are passed explicitly so the batch job follows the
    # pipeline's parameters instead of the component defaults.
    batch_predict_op = DataprocPySparkBatchOp(
        project=project_id,
        location=region,
        main_python_file_uri="gs://model_experimentation_2025/scripts/spark_batch_predict.py",
        args=[
            project_id,
            region,
            model_deploy_op.outputs["endpoint_id"],
            bucket,
            prediction_input_blob,
            prediction_output_blob,
            # Script arguments are strings; the meaning of this value is
            # defined by spark_batch_predict.py (not shown here).
            "30",
        ],
    )

    # 4. Teardown of resources post-prediction.
    # NOTE: these steps only run when the batch job succeeds; a failed run
    # leaves the endpoint and its deployment in place.
    model_undeploy_op = ModelUndeployOp(
        endpoint=model_deploy_op.outputs["deployed_endpoint"],
        model=model_deploy_op.outputs["deployed_model"],
        # traffic_split={"0": 100} # Optional: to ensure all traffic is removed from this model_id
        # If this is the only model, it will be removed.
    ).after(batch_predict_op)

    delete_endpoint_op = EndpointDeleteOp(
        endpoint=model_deploy_op.outputs[
            "deployed_endpoint"
        ],  # Use the same endpoint from deploy op
    )
    # Delete only once the model is undeployed; ordering this after the batch
    # job alone would let it run concurrently with the undeploy step.
    delete_endpoint_op.after(model_undeploy_op)

## Compile the pipeline

In [12]:
# Compile deploy_model_pipeline to a local spec file; the same file name is
# used as template_path when the job is submitted.
kfp.compiler.Compiler().compile(
    package_path="predict_w_reservations.json",
    pipeline_func=deploy_model_pipeline,
)

In [15]:
# Runtime parameter values for the pipeline. Replace the project, region,
# model, reservation, and bucket values with your own.
import time

# Timestamp captured once so each run writes to a distinct output object.
epoch_time = time.time()
pipeline_params = {
    "project_id": "wortz-project-352116",
    "model": "3416616934593003520",
    "region": "us-central1",
    "shared_project_id": "wortz-project-352116",
    "zone": "us-central1-b",
    "reservation_name": "a100-custom-image-reservation",
    "endpoint_display_name": "Reservation_Endpoint",
    "deployed_model_display_name": "My_deployed_model",
    "accelerator_type": "NVIDIA_TESLA_A100",
    "machine_type": "a2-highgpu-1g",
    "bucket": "model_experimentation_2025",
    "prediction_input_blob": "prediction_data/test.csv",
    "prediction_output_blob": f"output_data/predictions_{epoch_time}.jsonl",
}

In [16]:
from google.cloud import aiplatform

# Set SDK defaults: project and region come from the pipeline parameters, and
# the run is launched as the dedicated Vertex service account.
aiplatform.init(
    project=pipeline_params["project_id"],
    location=pipeline_params["region"],
    service_account="vertex-sa@wortz-project-352116.iam.gserviceaccount.com",
)
job = aiplatform.PipelineJob(
    display_name="Predictions with GPU Reservations",
    template_path="predict_w_reservations.json",
    parameter_values=pipeline_params,
    project=pipeline_params["project_id"],
    location=pipeline_params["region"],
    # The pipeline creates an endpoint and deletes it at the end of the run.
    # With caching on, a rerun could reuse cached create/deploy outputs that
    # point at an endpoint an earlier run already deleted.
    enable_caching=False,
)

# Submit without blocking; progress is followed in the Cloud console.
job.submit()

Creating PipelineJob
PipelineJob created. Resource name: projects/679926387543/locations/us-central1/pipelineJobs/deploy-model-with-reserved-gpu-20250603185647
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/679926387543/locations/us-central1/pipelineJobs/deploy-model-with-reserved-gpu-20250603185647')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/deploy-model-with-reserved-gpu-20250603185647?project=679926387543


In [None]:
#### WIP

In [1]:
# import json

# one_sample = datasets["test"].map(scale).take(1)
# list_data = list(one_sample.as_numpy_iterator())
# instances = {"instances": list_data[0][0].tolist()}
# json_str = json.dumps(instances)
# with open("data.json", "w", encoding="utf-8") as f:
#     json.dump(json_str, f, ensure_ascii=False, indent=4)