In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Training using Private Service Connect interface

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/jbrache/vertex-ai-things/blob/main/MLOps/Training/Vertex_Training_PSC_I.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2Fjbrache%2Fvertex-ai-things%2Fmain%2FMLOps%2FTraining%2FVertex_Training_PSC_I.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/jbrache/vertex-ai-things/main/MLOps/Training/Vertex_Training_PSC_I.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/jbrache/vertex-ai-things/blob/main/MLOps/Training/Vertex_Training_PSC_I.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

| Author(s) |
| --- |
| [Jose Brache](https://github.com/jbrache) |

## Overview

Private Service Connect interface is recommended for private connectivity since it reduces the chance of IP exhaustion and allows for transitive peering. See [Set up a Private Service Connect interface](https://cloud.google.com/vertex-ai/docs/general/vpc-psc-i-setup) on how to configure it for Vertex AI.

Review the public docs for the latest information on support: [Use Private Service Connect interface for Vertex AI Training](https://cloud.google.com/vertex-ai/docs/training/psc-i-egress)

This example covers the following steps:
1. Setup multiple projects: Project 'a' (VPC Host Project), and Project 'b' (Vertex AI Project)
2. Create Test Training Code Container
3. Build/Push Custom Docker Container using Cloud Build
4. Setup PSC-I in VPC Host Project for Vertex AI Training
5. Prepare Training Job
6. Submit Training Job with PSC-I
7. Clean Up

## Get started

### Install Vertex AI SDK and other required packages

In [None]:
# import sys

# if "google.colab" in sys.modules:
#     !pip3 install --upgrade google-cloud-aiplatform

#     # Restart the notebook kernel after installs.
#     import IPython

#     app = IPython.Application.instance()
#     app.kernel.do_shutdown(True)

---

#### ⚠️ Do not forget to click the "RESTART RUNTIME" button above.

---

If you install additional packages, it's suggested to restart the notebook kernel so it can find the packages.

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# Runs only when the `google.colab` module is loaded, i.e. when the notebook is running on Colab.
if "google.colab" in sys.modules:
    from google.colab import auth

    # Authenticate the current Colab user.
    auth.authenticate_user()

# 0-0. Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
############## Set Project ###########################
REGION = "us-central1"  # @param {type: "string"}

# Calling this Project 'a' where the VPC Host project exists
PROJECT_ID_VPC_HOST = "vpc-host-jb01"  # @param {type:"string"}
VPC_NAME = "vertex-vpc-prod"
SUBNET_NAME = f"{REGION}-vertex"
NETWORK_ATTACHMENT_NAME = f"vertex-attachment-{REGION}"

# Calling this Project 'b' where Vertex AI Training jobs run
PROJECT_ID = "ds-dev-jb01"  # @param {type:"string"}

BUCKET_URI = f"gs://{PROJECT_ID}-artifacts"

############ Setting the Enviroment ###########
SERVICE ="aiplatform.googleapis.com"
ENDPOINT="us-central1-aiplatform.googleapis.com"

# Set the project id
!gcloud config set project {PROJECT_ID}

## 0-1. Enable APIs
The following APIs are enabled in this demo:
1. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)
2. [Enable the Cloud Build API](https://console.cloud.google.com/flows/enableapi?apiid=cloudbuild.googleapis.com)
3. [Enable the Artifact Registry API](https://console.cloud.google.com/flows/enableapi?apiid=artifactregistry.googleapis.com): You must enable the Artifact Registry API for your project. You will store your custom training container in Artifact Registry. [Learn more about Enabling the Artifact Registry service](https://cloud.google.com/artifact-registry/docs/enable-service)
4. [Enable the Service Networking API](https://console.cloud.google.com/flows/enableapi?apiid=servicenetworking.googleapis.com)


In [None]:
############# Enable the APIs for Vertex AI Project ########################
!gcloud services enable --project=$PROJECT_ID aiplatform.googleapis.com artifactregistry.googleapis.com cloudbuild.googleapis.com --project $PROJECT_ID

In [None]:
############# Enable the APIs for VPC Host Project ########################
!gcloud services enable --project=$PROJECT_ID_VPC_HOST aiplatform.googleapis.com compute.googleapis.com servicenetworking.googleapis.com --project $PROJECT_ID_VPC_HOST

## 0-2. Import libraries

In [None]:
import os
import sys
import requests
import json

## 0-3. Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Flip to True to create the bucket; leave False when it already exists.
create_bucket = False
if create_bucket:
    # Make the bucket BUCKET_URI in REGION under PROJECT_ID.
    !gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

## 0-4. Training Code Container

In [None]:
# Local directory that holds the training code and serves as the Docker build context
CONTAINER_DIR = "test_container"

Verify the location where the training code exists

In [None]:
# Remove any existing container directory so the build starts from a clean state
!rm -rf $CONTAINER_DIR
# Create the container directory along with its `trainer` package subdirectory
!mkdir -p $CONTAINER_DIR/trainer
# Create an empty training script (its contents are written by a later %%writefile cell)
!touch $CONTAINER_DIR/trainer/train.py
# Create the package init file
!touch $CONTAINER_DIR/trainer/__init__.py

In [None]:
# Print the local directory that holds the training code (the Docker build context)
# and confirm that it exists on disk.
print(f"Local container directory with training code: {CONTAINER_DIR}")
print("Check whether the container directory exists:", os.path.exists(CONTAINER_DIR))

In [None]:
%%writefile $CONTAINER_DIR/trainer/train.py
import argparse
import logging
import sys
import os
import time
import json

def parse_args():
  """Parses command-line arguments for the test training job.

  Arguments that are not recognized are collected instead of rejected, so
  extra flags passed to the container do not abort the job.

  Returns:
    A `(parsed, unknown)` tuple: the `argparse.Namespace` holding `log_level`
    and `sleep`, and the list of argument strings that were not recognized.
  """
  parser = argparse.ArgumentParser()

  parser.add_argument('--log-level', help='Logging level.', choices=['DEBUG', 'ERROR', 'FATAL', 'INFO', 'WARN'], default='INFO')
  parser.add_argument('--sleep', help='How long to sleep, in seconds, with an optional trailing "s" (e.g. 600 or 600s).', type=str, default='600s')
  parsed, unknown = parser.parse_known_args()
  return parsed, unknown

if __name__ == '__main__':
  # Script entry point: parse the flags, configure logging, then idle.
  arguments, unknown_args = parse_args()
  logging.basicConfig(level=arguments.log_level)

  # The duration may carry a trailing "s" (e.g. "600s"); drop it before converting.
  sleep = int(arguments.sleep[:-1] if arguments.sleep[-1] == "s" else arguments.sleep)

  # Keep the job alive long enough to connect the web shell
  logging.info(f'Sleeping for {sleep} seconds...')
  time.sleep(sleep)

In [None]:
%%writefile $CONTAINER_DIR/requirements.txt

In [None]:
%%writefile $CONTAINER_DIR/pyproject.toml
[tool.poetry]
# Poetry is used for dependency management only; the project itself is not packaged.
package-mode = false

[tool.poetry.dependencies]
# Exact match for the interpreter version of the Dockerfile's base image (python:3.10.12).
python = "==3.10.12"

In [None]:
%%writefile $CONTAINER_DIR/Dockerfile
# https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0

# Fetch the base image
# https://hub.docker.com/r/amd64/python/
# Single-stage build: this image both builds the virtual environment and runs the trainer
FROM amd64/python:3.10.12

# Install pipx
RUN apt-get update && \
    apt-get install --no-install-suggests --no-install-recommends --yes pipx

# Install network tools: ping, dig, nslookup, tcpdump (for connectivity checks from the job)
RUN apt-get update && \
    apt-get install -y net-tools iputils-ping tcpdump dnsutils

# Put /root/.local/bin on PATH so the pipx-installed poetry can be invoked below
ENV PATH="/root/.local/bin:${PATH}"
RUN pipx install poetry
RUN pipx inject poetry poetry-plugin-bundle

# Non-interactive Poetry that creates the virtualenv inside the project directory
# and keeps its cache in a location that is removed after install
ENV POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_IN_PROJECT=1 \
    POETRY_VIRTUALENVS_CREATE=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache

# Set the working dir for the rest of the commands
WORKDIR /

# COPY requirements.txt .
COPY pyproject.toml .
# COPY poetry.lock .

# Install dependencies only (no root package), then drop the Poetry cache
RUN poetry install --no-root && rm -rf $POETRY_CACHE_DIR

# Point at the in-project virtualenv (/.venv, since WORKDIR is /)
ENV VIRTUAL_ENV=/.venv \
    PATH="/.venv/bin:$PATH"

# Copies the trainer code to the docker image.
COPY trainer /trainer

# Sets up the entry point to invoke the trainer.
ENTRYPOINT [ "poetry", "run", "python", "-m", "trainer.train" ]

# 1-0. Build and Push Custom Container to Artifact Registry

You must have enabled the Artifact Registry API for your project in the previous steps. You will store your custom training container in Artifact Registry.

## 1-1. Create a private Docker repository
Your first step is to create a Docker repository in Artifact Registry.

1 - Run the `gcloud artifacts repositories create` command to create a new Docker repository with your region with the description `Docker repository`.

2 - Run the `gcloud artifacts repositories list` command to verify that your repository was created.

In [None]:
# Artifact Registry repository to create / use for the training image
PRIVATE_REPO = "test-training"
# Full image path: <region>-docker.pkg.dev/<project>/<repo>/test:latest
TRAIN_IMAGE = (
    f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{PRIVATE_REPO}/test:latest"
)
print("Private Repo:", PRIVATE_REPO)
print("Training Container Image:", TRAIN_IMAGE)

In [None]:
# Create the Docker-format repository PRIVATE_REPO in REGION within the Vertex AI project
!gcloud artifacts repositories create {PRIVATE_REPO} --repository-format=docker --project={PROJECT_ID} --location={REGION} --description="Docker repository"

In [None]:
# List the project's repositories to verify that the new one was created
!gcloud artifacts repositories --project={PROJECT_ID} list

## 1-2. Build and push the custom docker container image by using Cloud Build

Build and push a Docker image with Cloud Build

In [None]:
# Build from CONTAINER_DIR (which holds the Dockerfile) with Cloud Build and tag the result as TRAIN_IMAGE
!cd $CONTAINER_DIR && gcloud builds submit --timeout=1800s --project={PROJECT_ID} --region={REGION} --tag {TRAIN_IMAGE}

# 2-0. Set up a Private Service Connect interface for your VPC

This section follows the guide for [setting up a Private Service Connect interface for Vertex AI resources](https://cloud.google.com/vertex-ai/docs/general/vpc-psc-i-setup).

In [None]:
# `!(...)` captures the command output as a list of lines; the first line is taken as the project number.
# Project number of the VPC Host project (Project 'a')
PROJECT_NUMBER_VPC_HOST = !(gcloud projects describe $PROJECT_ID_VPC_HOST --format="value(projectNumber)")
PROJECT_NUMBER_VPC_HOST = PROJECT_NUMBER_VPC_HOST[0]

# Project number of the Vertex AI project (Project 'b')
PROJECT_NUMBER = !(gcloud projects describe $PROJECT_ID --format="value(projectNumber)")
PROJECT_NUMBER = PROJECT_NUMBER[0]

In [None]:
# Show the ID and number of both projects so the lookups above can be checked by eye
print("PROJECT_ID:", PROJECT_ID)
print("PROJECT_NUMBER:", PROJECT_NUMBER)
print("----------")
print("PROJECT_ID_VPC_HOST:", PROJECT_ID_VPC_HOST)
print("PROJECT_NUMBER_VPC_HOST:", PROJECT_NUMBER_VPC_HOST)

In [None]:
# Full resource name of the network attachment in the VPC Host project.
# The project-ID form below used to be assigned and then immediately overwritten,
# so it is kept only as a commented-out alternative.
# FULL_NETWORK_ATTACHMENT_NAME = f"projects/{PROJECT_ID_VPC_HOST}/regions/{REGION}/networkAttachments/{NETWORK_ATTACHMENT_NAME}"
FULL_NETWORK_ATTACHMENT_NAME = f"projects/{PROJECT_NUMBER_VPC_HOST}/regions/{REGION}/networkAttachments/{NETWORK_ATTACHMENT_NAME}"

FILESTORE_ZONE = "us-central1-a"

print(FULL_NETWORK_ATTACHMENT_NAME)

## 2-1. IAM Bindings

In [None]:
# Vertex AI service agent of the Vertex AI project (Project 'b')
AI_PLATFORM_SERVICE_AGENT = f"service-{PROJECT_NUMBER}@gcp-sa-aiplatform.iam.gserviceaccount.com"

# Grant it Compute Network Admin on the VPC Host project
!gcloud projects add-iam-policy-binding $PROJECT_ID_VPC_HOST \
  --member="serviceAccount:$AI_PLATFORM_SERVICE_AGENT"  \
  --role="roles/compute.networkAdmin"

In [None]:
# Grant the compute-system service agent of the Vertex AI project Compute Network Admin on the VPC Host project
!gcloud projects add-iam-policy-binding $PROJECT_ID_VPC_HOST \
  --member="serviceAccount:service-$PROJECT_NUMBER@compute-system.iam.gserviceaccount.com"  \
  --role="roles/compute.networkAdmin"

In [None]:
# Service account the training job runs as: the `<number>-compute` account of the Vertex AI project
TRAINING_SA = f"{PROJECT_NUMBER}-compute@developer.gserviceaccount.com"

# Grant it Compute Network User on the VPC Host project
!gcloud projects add-iam-policy-binding $PROJECT_ID_VPC_HOST \
  --member="serviceAccount:$TRAINING_SA"  \
  --role="roles/compute.networkUser"

In [None]:
# Vertex AI service agent of the VPC Host project (Project 'a').
# NOTE: this rebinds AI_PLATFORM_SERVICE_AGENT, which an earlier cell set to the
# service agent of the Vertex AI project.
AI_PLATFORM_SERVICE_AGENT = f"service-{PROJECT_NUMBER_VPC_HOST}@gcp-sa-aiplatform.iam.gserviceaccount.com"

# Grant it Compute Network Admin on the VPC Host project
!gcloud projects add-iam-policy-binding $PROJECT_ID_VPC_HOST \
  --member="serviceAccount:$AI_PLATFORM_SERVICE_AGENT"  \
  --role="roles/compute.networkAdmin"

## 2-1. Create VPC Network

In [None]:
# Create the VPC in the VPC Host project; custom subnet mode means subnets are created explicitly
!gcloud compute networks create {VPC_NAME} \
    --project={PROJECT_ID_VPC_HOST} \
    --subnet-mode custom

## 2-2. Configure following firewalls rules

In [None]:
# NOTE(review): none of these rules sets --source-ranges — confirm that the resulting
# source scope is acceptable for your environment before running.
# Allow SSH (tcp:22)
!gcloud compute firewall-rules create {VPC_NAME}-firewall1 --network {VPC_NAME} --allow tcp:22 --project {PROJECT_ID_VPC_HOST}

# Allow RDP (tcp:3389)
!gcloud compute firewall-rules create {VPC_NAME}-firewall2 --network {VPC_NAME} --allow tcp:3389 --project {PROJECT_ID_VPC_HOST}

# Allow ICMP (e.g. ping)
!gcloud compute firewall-rules create {VPC_NAME}-firewall3 --network {VPC_NAME} --allow icmp --project {PROJECT_ID_VPC_HOST}

## 2-3. Create VPC Subnet

In [None]:
!gcloud compute networks subnets create {REGION} \
    --network {VPC_NAME} \
    --range 10.0.0.0/16 \
    --enable-private-ip-google-access \
    --region={REGION} \
    --project={PROJECT_ID_VPC_HOST}

## 2-4. Create Network Attachment

In [None]:
# Create the network attachment on subnet SUBNET_NAME in the VPC Host project,
# using the ACCEPT_MANUAL connection preference
!gcloud compute network-attachments create {NETWORK_ATTACHMENT_NAME} \
    --region={REGION} \
    --connection-preference=ACCEPT_MANUAL \
    --subnets={SUBNET_NAME} \
    --project={PROJECT_ID_VPC_HOST}

In [None]:
# Full resource name of the VPC, keyed by the VPC Host project's number
NETWORK_NAME = f"projects/{PROJECT_NUMBER_VPC_HOST}/global/networks/{VPC_NAME}"
print(NETWORK_NAME)

In [None]:
# @title [Optional]: Setup your NFS instance
# !gcloud config set filestore/zone {ZONE}

# !gcloud filestore instances create {INSTANCE_NAME} \
#     --tier=TIER \
#     --file-share=name="file_share_name",capacity={SIZE} \
#     --network=name={NETWORK}

# 3-0. Run custom training jobs with PSC-I

This section creates a custom training job with a Private Service Connect interface. View the [documentation page](https://cloud.google.com/vertex-ai/docs/training/psc-i-egress) for the most up to date information.

## 3-1. Prepare training jobs

Vertex AI Training supports submitting custom training jobs with a prebuilt container, a custom container, or a Python application via **HTTP request, Vertex AI SDK or gcloud CLI**. Learn more [here](https://cloud.google.com/vertex-ai/docs/training/code-requirements).

In this example, we will demonstrate how to run a custom job with custom containers. Please set the images below to your own custom images.
Note, if it's not a public image, please ensure it's already pushed to your project.

https://cloud.google.com/vertex-ai/docs/training/containers-overview

In [None]:
# @title Function Defs
### Create CPU Test Training Job
def create_single_replica_cpu_job(network_peering: str = None, network_attachment: str = None, enable_web_access: bool = False):
  ############ Set Job Service Endpoint ################
  job_request_uri = f"https://{ENDPOINT}/v1beta1/projects/{PROJECT_ID}/locations/{REGION}/customJobs"

  bearer_token = !gcloud auth application-default print-access-token
  headers = {
      'Content-Type': 'application/json',
      'Authorization': 'Bearer {}'.format(bearer_token[0]),
  }

  print("Calling endpoint: {} to create custom training job".format(job_request_uri))
  cpu_single_job = {
    "display_name": "CPU Test Job",
    "job_spec": {
        "worker_pool_specs": [
          CPU_WORKER_SPEC,
        ],
        "service_account": TRAINING_SA,
    },
    "labels": {
        "network_type": "none"
    }
  }

  if network_attachment and (network_peering == None):
    psc_interface_config = {
        "network_attachment": network_attachment
    }
    labels = {
        "network_type": "psc-i"
    }
    cpu_single_job["job_spec"]["psc_interface_config"] = psc_interface_config
    cpu_single_job["labels"] = labels

  if (network_attachment == None) and network_peering:
    labels = {
        "network_type": "peering"
    }
    cpu_single_job["job_spec"]["network"] = network_peering
    cpu_single_job["labels"] = labels

  if enable_web_access:
    cpu_single_job["job_spec"]["enable_web_access"] = enable_web_access

  print(json.dumps(cpu_single_job, indent=2))

  response = requests.post(job_request_uri, json=cpu_single_job, headers=headers)

  print("response:", response)
  job_name = None
  job_id = None
  if response.reason == 'OK':
    job_name = response.json()['name']
    job_id = job_name.split('/')[-1]
    url = f"console.cloud.google.com/vertex-ai/locations/{REGION}/training/{job_id}/cpu?&project={PROJECT_ID}"
    print("Created Job: ", response.json()['name'])
    print("Link:", "https://"+url)
  else:
    print(response.text)
  return job_name, job_id, response

In [None]:
# Artifact Registry repository and image used for the training job.
# Repeats the definitions from section 1-1.
PRIVATE_REPO = "test-training"
TRAIN_IMAGE = (
    f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{PRIVATE_REPO}/test:latest"
)
print("Private Repo:", PRIVATE_REPO)
print("Training Container Image:", TRAIN_IMAGE)

In [None]:
# Look up both project numbers again (repeats the lookup from section 2-0).
# `!(...)` captures the command output as a list of lines; the first line is taken as the project number.
PROJECT_NUMBER_VPC_HOST = !(gcloud projects describe $PROJECT_ID_VPC_HOST --format="value(projectNumber)")
PROJECT_NUMBER_VPC_HOST = PROJECT_NUMBER_VPC_HOST[0]

PROJECT_NUMBER = !(gcloud projects describe $PROJECT_ID --format="value(projectNumber)")
PROJECT_NUMBER = PROJECT_NUMBER[0]

In [None]:
# Show the ID and number of both projects for a quick visual check
print("PROJECT_ID:", PROJECT_ID)
print("PROJECT_NUMBER:", PROJECT_NUMBER)
print("----------")
print("PROJECT_ID_VPC_HOST:", PROJECT_ID_VPC_HOST)
print("PROJECT_NUMBER_VPC_HOST:", PROJECT_NUMBER_VPC_HOST)

In [None]:
# Full resource name of the network attachment, keyed by the VPC Host project's number.
# Alternative keyed by project ID, left disabled:
# FULL_NETWORK_ATTACHMENT_NAME = f"projects/{PROJECT_ID_VPC_HOST}/regions/{REGION}/networkAttachments/{NETWORK_ATTACHMENT_NAME}"
FULL_NETWORK_ATTACHMENT_NAME = f"projects/{PROJECT_NUMBER_VPC_HOST}/regions/{REGION}/networkAttachments/{NETWORK_ATTACHMENT_NAME}"

print(FULL_NETWORK_ATTACHMENT_NAME)

In [None]:
# Prepare training images: the CPU job runs the image built in section 1
CPU_IMAGE = TRAIN_IMAGE
# The bucket root is used as the job's output directory
JOB_DIR = BUCKET_URI
print("Using image: ", CPU_IMAGE)
print("Output Model Directory: ", JOB_DIR)
print("Network Attachment: ", FULL_NETWORK_ATTACHMENT_NAME)

## 3-2. Training Job Config

In [None]:
# Machine type for the single CPU worker
CPU_MACHINE_TYPE = "n2-standard-4"  # @param {type:"string"}
# Service account the job runs as: the `<number>-compute` account of the Vertex AI project
TRAINING_SA = f"{PROJECT_NUMBER}-compute@developer.gserviceaccount.com"

In [None]:
############ Set Job Service Endpoint ################
# Same URI that create_single_replica_cpu_job builds internally.
job_request_uri = f"https://{ENDPOINT}/v1beta1/projects/{PROJECT_ID}/locations/{REGION}/customJobs"

# This is consistent with the default disk spec of jobs.
DISK_SPEC = {
  "boot_disk_type": "pd-ssd",
  "boot_disk_size_gb": 100
}

############ Set WorkerPool Spec #####################
# https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec
# One replica running CPU_IMAGE. "command" is left empty; "args" passes
# --sleep=600s to the trainer, which sleeps for that long.
CPU_WORKER_SPEC = {
  "machine_spec": {
    "machine_type": CPU_MACHINE_TYPE,
  },
  "replica_count": 1,
  "container_spec": {
    "image_uri": CPU_IMAGE,
    "command": [],
    "args": [
        '--sleep=600s',
    ],
  },
  "disk_spec": DISK_SPEC
}

print('CPU worker spec:\n', json.dumps(CPU_WORKER_SPEC, indent=2))

## 3-3. Create CPU test job on Vertex AI Training

In [None]:
# Submit the job over PSC-I: pass the network attachment, leave peering unset,
# and enable web access.
job_name, job_id, custom_job_response = create_single_replica_cpu_job(network_peering=None,
                                                                      network_attachment=FULL_NETWORK_ATTACHMENT_NAME,
                                                                      enable_web_access=True)

In [None]:
# Pretty-print the JSON body returned by the create request
print(json.dumps(json.loads(custom_job_response.text), indent=2))

# 4-0. Get Job Details

In [None]:
# Option 1: Use the Custom Job ID to get details
# JOB_ID = "" # @param {type:"string"}
# !gcloud beta ai custom-jobs describe {JOB_ID} --project={PROJECT_ID} --region={REGION}

In [None]:
# Option 2: List existing custom jobs, filter running jobs and ones with the set label
# The filter keeps jobs whose state is not SUCCEEDED, FAILED or CANCELLED and whose
# `network_type` label is `psc-i` (the label set for jobs submitted with a network attachment).
FILTER = '"(state!="JOB_STATE_SUCCEEDED" AND state!="JOB_STATE_FAILED" AND state!="JOB_STATE_CANCELLED") AND labels.network_type=psc-i"'
!gcloud beta ai custom-jobs list --project={PROJECT_ID} --region={REGION} --filter={FILTER}

# 5-0. Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- Cloud Storage Bucket
- Artifacts Repository
- VPC Network

In [None]:
# Set these to True only if you'd like to delete the corresponding resource
delete_bucket = False
delete_artifacts_repo = False

if delete_bucket:
  # NOTE(review): JOB_DIR is assigned from BUCKET_URI in an earlier cell, so both
  # commands target the same bucket unless JOB_DIR was changed — confirm the
  # second command is still needed.
  !gsutil rm -rf $JOB_DIR
  !gsutil rm -r $BUCKET_URI

if delete_artifacts_repo:
  # --quiet skips the interactive confirmation prompt
  !gcloud artifacts repositories delete {PRIVATE_REPO} --project={PROJECT_ID} --location={REGION} --quiet