In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Training with VPC Network Peering

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/jbrache/vertex-ai-things/blob/main/MLOps/Training/Vertex_Training_with_VPC_Network_Peering.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2Fjbrache%2Fvertex-ai-things%2Fmain%2FMLOps%2FTraining%2FVertex_Training_with_VPC_Network_Peering.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/jbrache/vertex-ai-things/main/MLOps/Training/Vertex_Training_with_VPC_Network_Peering.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/jbrache/vertex-ai-things/blob/main/MLOps/Training/Vertex_Training_with_VPC_Network_Peering.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

| Author(s) |
| --- |
| [Jose Brache](https://github.com/jbrache) |

## Overview

You can configure [Vertex AI to peer with Virtual Private Cloud](https://cloud.google.com/vertex-ai/docs/general/vpc-peering) (VPC) to connect directly with certain resources in Vertex AI.

This guide shows how to set up VPC Network Peering in a VPC Host Project to peer your network with Vertex AI Training. This guide is recommended for networking administrators who are already familiar with Google Cloud networking concepts.

Review the public docs for the latest information on support:
- [Set up VPC Network Peering for certain Vertex AI resources](https://cloud.google.com/vertex-ai/docs/general/vpc-peering)
- [Set up Connectivity from Vertex AI to Other Networks](https://cloud.google.com/vertex-ai/docs/general/hybrid-connectivity)

This example covers the following steps:
1. Setup projects as the **VPC Host Project (Project 'a')** and the **Vertex AI Project (Project 'b')**
2. Create Test Training Code Container
3. Build/Push Custom Docker Container using Cloud Build
4. Setup VPC Peering in VPC Host Project for Vertex AI Training
5. Prepare Training Job
6. Submit Training Job with VPC Network Peering in **us-central1**
7. Submit Training Job with VPC Network Peering in **us-west1**
8. Clean Up

## Get started

### Install Vertex AI SDK and other required packages

In [None]:
# Optional, Colab only: uncomment this cell to upgrade the Vertex AI SDK and
# then restart the kernel so that the upgraded package is picked up.
# import sys

# if "google.colab" in sys.modules:
#     !pip3 install --upgrade google-cloud-aiplatform

#     # Restart the notebook kernel after installs.
#     import IPython

#     app = IPython.Application.instance()
#     app.kernel.do_shutdown(True)

---

#### ⚠️ Do not forget to click the "RESTART RUNTIME" button above.

---

If you install additional packages, it's suggested to restart the notebook kernel so it can find the packages.

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# Only run the Colab authentication flow when the google.colab package is
# loaded in this interpreter; in any other environment this cell is a no-op.
if "google.colab" in sys.modules:
    from google.colab import auth

    # Authenticate the current Colab user.
    auth.authenticate_user()

# 0-0. Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
############## Set Region for Shared Resources ###########################
# Region used below for the Artifact Registry repository and the Cloud Build run.
REGION = "us-central1"  # @param {type: "string"}

############## Set VPC Host Project ###########################
# Calling this Project 'a' where the VPC Host project exists
# Replace the value with your own project ID before running.
PROJECT_ID_VPC_HOST = "ds-dev-jb02-psci"  # @param {type:"string"}
# Name of the VPC network that is created in section 2-1 and peered in section 2-3.
VPC_NAME = "vertex-vpc-prod"

# Region #1
REGION_CENTRAL = "us-central1"  # @param {type: "string"}
SUBNET_NAME_CENTRAL = f"{REGION_CENTRAL}-vertex-peering"

# Region #2
REGION_WEST = "us-west1"  # @param {type: "string"}
# Only referenced by the commented-out ("Unused") subnet command in section 2-2.
SUBNET_NAME_WEST = f"{REGION_WEST}-vertex-peering"

############## Set Vertex AI Project ###########################
# Calling this Project 'b' where Vertex AI Training jobs run
# NOTE: in this example both project IDs are set to the same value.
PROJECT_ID = "ds-dev-jb02-psci"  # @param {type:"string"}

# Set the project id
# Makes the Vertex AI project the default project of the gcloud CLI.
!gcloud config set project {PROJECT_ID}

## 0-1. Enable APIs
The following APIs are enabled in this demo:
1. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)
2. [Enable the Cloud Build API](https://console.cloud.google.com/flows/enableapi?apiid=cloudbuild.googleapis.com)
3. [Enable the Artifact Registry API](https://console.cloud.google.com/flows/enableapi?apiid=artifactregistry.googleapis.com): You must enable the Artifact Registry API for your project. You will store your custom training container in Artifact Registry. [Learn more about Enabling the Artifact Registry service](https://cloud.google.com/artifact-registry/docs/enable-service)
4. [Enable the Service Networking API](https://console.cloud.google.com/flows/enableapi?apiid=servicenetworking.googleapis.com)


In [None]:
############# Enable the APIs for Vertex AI Project ########################
# Enables Vertex AI, Artifact Registry, Cloud Build and Service Networking
# on the project where the training jobs run.
!gcloud services enable aiplatform.googleapis.com artifactregistry.googleapis.com cloudbuild.googleapis.com servicenetworking.googleapis.com --project $PROJECT_ID

In [None]:
############# Enable the APIs for VPC Host Project ########################
# Enables Compute Engine and Service Networking on the project that owns the VPC.
!gcloud services enable compute.googleapis.com servicenetworking.googleapis.com --project $PROJECT_ID_VPC_HOST

## 0-2. Import libraries

In [None]:
import json
import logging
import traceback
import sys
import os

## 0-3. Training Code Container

In [None]:
# Local directory that holds the training code and serves as the Docker build context
CONTAINER_DIR = "test_container"

Create the local directory for the training code, then verify that it exists

In [None]:
# Start from a clean slate: remove the container directory if it already exists
!rm -rf $CONTAINER_DIR
# Create the container directory together with its trainer subdirectory
!mkdir -p $CONTAINER_DIR/trainer
# Create an empty training script (its content is written by a later cell)
!touch $CONTAINER_DIR/trainer/train.py
# Create the init file that marks trainer as a Python package
!touch $CONTAINER_DIR/trainer/__init__.py

In [None]:
# Print the location where the training code exists; this directory is later
# submitted to Cloud Build to create the container image in Artifact Registry.
print(f"Local container directory with training code: {CONTAINER_DIR}")
# Sanity check: should print True once the directory has been created above.
print("Check whether the container directory exists:", os.path.exists(CONTAINER_DIR))

In [None]:
%%writefile $CONTAINER_DIR/trainer/train.py
import argparse
import logging
import sys
import os
import time
import json

def parse_args(argv=None):
  """Parses the trainer's command-line arguments.

  Args:
    argv: Optional list of argument strings to parse. When None (the
      default), argparse reads the arguments from sys.argv[1:].

  Returns:
    A (parsed, unknown) tuple: `parsed` is an argparse.Namespace with the
    `log_level` and `sleep` attributes, and `unknown` is the list of
    argument strings that were not recognized.
  """
  parser = argparse.ArgumentParser()

  parser.add_argument('--log-level', help='Logging level.', choices=['DEBUG', 'ERROR', 'FATAL', 'INFO', 'WARN'], default='INFO')
  # Kept as a string so that both "600" and "600s" are accepted; the caller
  # strips the optional trailing "s" before converting to an integer.
  parser.add_argument('--sleep', help='Amount of time in seconds to sleep.', type=str, default='600s')
  # parse_known_args collects unrecognized arguments instead of exiting on them.
  parsed, unknown = parser.parse_known_args(argv)
  return parsed, unknown

if __name__ == '__main__':
  # Script entry point: set up logging, then idle for the requested duration.
  arguments, unknown_args = parse_args()
  logging.basicConfig(level=arguments.log_level)

  # --sleep is either a bare integer ("600") or an integer followed by an
  # "s" unit ("600s"); drop the unit, if present, before converting.
  raw_sleep = arguments.sleep
  sleep = int(raw_sleep[:-1]) if raw_sleep[-1] == "s" else int(raw_sleep)

  # Keep the container running so there is time to connect the web shell.
  logging.info(f'Sleeping for {sleep} seconds...')
  time.sleep(sleep)

In [None]:
%%writefile $CONTAINER_DIR/requirements.txt
# No packages listed; the Dockerfile installs from pyproject.toml and does not copy this file.

In [None]:
%%writefile $CONTAINER_DIR/pyproject.toml
# Poetry configuration for the training image.
[tool.poetry]
# Use Poetry for dependency management only; the project itself is not packaged.
package-mode = false

[tool.poetry.dependencies]
# Pinned to the same interpreter version as the Docker base image (python:3.10.12).
python = "==3.10.12"

In [None]:
%%writefile $CONTAINER_DIR/Dockerfile
# Single-stage image for the test training container.
# Poetry setup adapted from:
# https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0

# Fetch the base image
# https://hub.docker.com/r/amd64/python/
# FROM amd64/python:3.10.12
# Base image; its Python version matches the pin in pyproject.toml
FROM python:3.10.12-slim-bookworm

# Install pipx
RUN apt-get update && \
    apt-get install --no-install-suggests --no-install-recommends --yes pipx

# Install network debugging tools (net-tools, ping, tcpdump, dig, nslookup)
# so that connectivity can be checked from inside the running container
RUN apt-get update && \
    apt-get install -y net-tools iputils-ping tcpdump dnsutils

# Add root's local bin directory to PATH so the pipx-installed `poetry` is found
ENV PATH="/root/.local/bin:${PATH}"
RUN pipx install poetry
RUN pipx inject poetry poetry-plugin-bundle

# Poetry settings: never prompt, create the virtualenv inside the project
# directory, and keep the cache in a location that is removed after install
ENV POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_IN_PROJECT=1 \
    POETRY_VIRTUALENVS_CREATE=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache

# Set the working dir for the rest of the commands
WORKDIR /

# COPY requirements.txt .
COPY pyproject.toml .
# COPY poetry.lock .

# Install the dependencies only (no root package), then delete the Poetry cache
RUN poetry install --no-root && rm -rf $POETRY_CACHE_DIR

# Put the in-project virtualenv (/.venv) first on PATH
ENV VIRTUAL_ENV=/.venv \
    PATH="/.venv/bin:$PATH"

# Copies the trainer code to the docker image.
COPY trainer /trainer

# Sets up the entry point to invoke the trainer.
# Arguments passed to the container (e.g. --sleep=3600s) are appended to this command.
ENTRYPOINT [ "poetry", "run", "python", "-m", "trainer.train" ]

# 1-0. Build and Push Custom Container to Artifact Registry

You must have enabled the Artifact Registry API for your project in the previous steps. You will store your custom training container in Artifact Registry.

## 1-1. Create a private Docker repository
Your first step is to create a Docker repository in Artifact Registry.

1 - Run the `gcloud artifacts repositories create` command to create a new Docker repository in your region with the description `Docker repository`.

2 - Run the `gcloud artifacts repositories list` command to verify that your repository was created.

In [None]:
# Repo to create / use for running training job
PRIVATE_REPO = "test-training"
# Full Artifact Registry image URI: <region>-docker.pkg.dev/<project>/<repo>/<image>:<tag>
TRAIN_IMAGE = (
    f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{PRIVATE_REPO}/test:latest"
)
print("Private Repo:", PRIVATE_REPO)
print("Training Container Image:", TRAIN_IMAGE)

In [None]:
# Create the Docker-format repository in REGION of the Vertex AI project.
!gcloud artifacts repositories create {PRIVATE_REPO} --repository-format=docker --project={PROJECT_ID} --location={REGION} --description="Docker repository"

In [None]:
# List the repositories of the project to verify that the new one exists.
!gcloud artifacts repositories --project={PROJECT_ID} list

## 1-2. Build and push the custom docker container image by using Cloud Build

Build and push a Docker image with Cloud Build

In [None]:
# Submit the container directory as the build context; --tag builds the
# Dockerfile found there and pushes the resulting image to TRAIN_IMAGE.
!cd $CONTAINER_DIR && gcloud builds submit --timeout=1800s --project={PROJECT_ID} --region={REGION} --tag {TRAIN_IMAGE}

# 2-0. Set up private services access for your VPC

Follow this guide for [creating the peering range](https://cloud.google.com/vertex-ai/docs/general/vpc-peering)

In [None]:
# Look up the numeric project number of the VPC host project.
# The `!(...)` assignment captures the command output as a list of lines,
# so the project number is taken from the first element.
PROJECT_NUMBER_VPC_HOST=!(gcloud projects describe $PROJECT_ID_VPC_HOST --format="value(projectNumber)")
PROJECT_NUMBER_VPC_HOST = PROJECT_NUMBER_VPC_HOST[0]

In [None]:
# Show the project number and the full network resource path that is built from it.
print(PROJECT_NUMBER_VPC_HOST)
print(f"projects/{PROJECT_NUMBER_VPC_HOST}/global/networks/{VPC_NAME}")

## 2-1. Create VPC Network

In [None]:
# Create the VPC in the host project. With custom subnet mode no subnets are
# created automatically; the subnet is added explicitly in section 2-2.
!gcloud compute networks create {VPC_NAME} \
    --project={PROJECT_ID_VPC_HOST} \
    --subnet-mode custom

## 2-2. Create VPC Subnet

In [None]:
# Create the us-central1 subnet with Private Google Access enabled.
# NOTE(review): 192.167.0.0/19 lies outside the RFC 1918 private ranges
# (192.168.0.0/16 is the private block) — possibly a typo for 192.168.0.0/19;
# confirm that a publicly routable range is intended here.
!gcloud compute networks subnets create {SUBNET_NAME_CENTRAL} \
    --network {VPC_NAME} \
    --range 192.167.0.0/19 \
    --enable-private-ip-google-access \
    --region={REGION_CENTRAL} \
    --project={PROJECT_ID_VPC_HOST}

In [None]:
# Unused
# NOTE(review): this command reuses the same 192.167.0.0/19 range as the
# us-central1 subnet; presumably it would need a different, non-overlapping
# range before it could be enabled — confirm.
# !gcloud compute networks subnets create {SUBNET_NAME_WEST} \
#     --network {VPC_NAME} \
#     --range 192.167.0.0/19 \
#     --enable-private-ip-google-access \
#     --region={REGION_WEST} \
#     --project={PROJECT_ID_VPC_HOST}

In [None]:
# List the subnets of the VPC host project to verify the subnet was created.
!gcloud compute networks subnets list --project={PROJECT_ID_VPC_HOST}

## 2-3. Create Peering Range

In [None]:
# Full resource path of the VPC, built from the host project *number* (not its ID).
NETWORK_NAME = f"projects/{PROJECT_NUMBER_VPC_HOST}/global/networks/{VPC_NAME}"
print(NETWORK_NAME)

In [None]:
# This is for display only; you can name the range anything.
PEERING_RANGE_NAME="google-reserved-range"

# Reserve a global internal address range in the VPC for the peering.
# No explicit start address is passed, only the prefix length.
# NOTE: `prefix-length=16` means a CIDR block with mask /16 will be
# reserved for use by Google services, such as Vertex AI.
# /24, /19 did not work for Vertex AI Training
!gcloud compute addresses create {PEERING_RANGE_NAME} \
  --global \
  --prefix-length=16 \
  --description="peering range for Google service" \
  --network={NETWORK_NAME} \
  --purpose=VPC_PEERING \
  --project={PROJECT_ID_VPC_HOST}

In [None]:
# Create the VPC connection.
# Peers the VPC with the Service Networking service, using the reserved range created above.
!gcloud services vpc-peerings connect \
  --service=servicenetworking.googleapis.com \
  --network={VPC_NAME} \
  --ranges={PEERING_RANGE_NAME} \
  --project={PROJECT_ID_VPC_HOST}

In [None]:
# List the peerings of the VPC to verify that the connection was created.
!gcloud compute networks peerings list --network {VPC_NAME} --project {PROJECT_ID_VPC_HOST}

## 2-4. Export Custom Routes [Optional]

If you use [custom routes](https://cloud.google.com/vertex-ai/docs/general/vpc-peering#export-custom-routes), you need to export them so that Vertex AI can import them. If you don't use custom routes, skip this section.

In [None]:
# NOTE(review): `peerings update` expects the name of the peering connection
# (as shown by `gcloud compute networks peerings list`), but this command passes
# the reserved range name; it also uses the Vertex AI project ID although the
# peering was created in the VPC host project — confirm before uncommenting.
# !gcloud compute networks peerings update {PEERING_RANGE_NAME} \
#     --network={NETWORK_NAME} \
#     --export-custom-routes \
#     --project={PROJECT_ID}

## 2-5. Set up Shared VPC [Optional]

This section is not covered in this tutorial. To set up Shared VPC, follow [this](https://cloud.google.com/vpc/docs/provisioning-shared-vpc#set-up-shared-vpc) guide.

If you use [Shared VPC](https://cloud.google.com/vpc/docs/shared-vpc), you usually use Vertex AI in a separate Google Cloud project than your VPC host project. Learn how to [provision Shared VPC](https://cloud.google.com/vpc/docs/provisioning-shared-vpc).

# 3-0. Run custom training jobs on the Peered VPC

## 3-1. Prepare training jobs

Vertex AI Training supports submitting custom training jobs with a prebuilt container, custom container and python application via **HTTP request, Vertex AI SDK or gcloud CLI**. Learn more [here](https://cloud.google.com/vertex-ai/docs/training/code-requirements).

In this example, we will demonstrate how to run a custom job with custom containers. Please specify the images below to your custom images.
Note, if it's not a public image, please ensure it's already pushed to your project.

https://cloud.google.com/vertex-ai/docs/training/containers-overview

In [None]:
# Repo to create / use for running training job
# Same definitions as in section 1-1 — presumably repeated so that section 3
# can be run without re-running the build section; verify if you change either copy.
PRIVATE_REPO = "test-training"
TRAIN_IMAGE = (
    f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{PRIVATE_REPO}/test:latest"
)
print("Private Repo:", PRIVATE_REPO)
print("Training Container Image:", TRAIN_IMAGE)

In [None]:
# Look up the numeric project number of the Vertex AI project.
# The captured command output is a list of lines; keep the first one.
PROJECT_NUMBER=!(gcloud projects describe $PROJECT_ID --format="value(projectNumber)")
PROJECT_NUMBER = PROJECT_NUMBER[0]

In [None]:
# Look up the numeric project number of the VPC host project (same lookup as in section 2-0).
# The captured command output is a list of lines; keep the first one.
PROJECT_NUMBER_VPC_HOST=!(gcloud projects describe $PROJECT_ID_VPC_HOST --format="value(projectNumber)")
PROJECT_NUMBER_VPC_HOST = PROJECT_NUMBER_VPC_HOST[0]

In [None]:
# Full resource path of the peered VPC; it is written into the job spec's `network` field below.
NETWORK_NAME = f"projects/{PROJECT_NUMBER_VPC_HOST}/global/networks/{VPC_NAME}"
print(NETWORK_NAME)

In [None]:
# Prepare training images
# The CPU job uses the custom image that was built and pushed in section 1.
CPU_IMAGE = TRAIN_IMAGE
print("Using image: ", CPU_IMAGE)
print("Network Name: ", NETWORK_NAME)

## 3-2. Training Job Config

In [None]:
# Machine type of the worker pool in the job spec.
CPU_MACHINE_TYPE = "n2-standard-4"  # @param {type:"string"}
# Service account the training job runs as: the Compute Engine default
# service account of the Vertex AI project.
TRAINING_SA = f"{PROJECT_NUMBER}-compute@developer.gserviceaccount.com"

In [None]:
# ----------- Set WorkerPool Spec -----------#
# https://cloud.google.com/sdk/gcloud/reference/beta/ai/custom-jobs/create
# YAML job spec passed to `gcloud ai custom-jobs create --config`:
# one worker pool with a single replica that runs the custom container,
# the service account the job runs as, and the peered VPC network.
# `args` is a list of strings in the container spec, so it is written as a
# YAML sequence (one "- " item per argument) rather than as a bare scalar.
# The text starts with a newline; the code that writes the file strips it.
CONTENT = f"""
workerPoolSpecs:
  - machineSpec:
      machineType: {CPU_MACHINE_TYPE}
    replicaCount: 1
    containerSpec:
      imageUri: {CPU_IMAGE}
      args:
        - --sleep=3600s
serviceAccount: {TRAINING_SA}
network: {NETWORK_NAME}
"""

In [None]:
# Write the job spec to disk for `gcloud ai custom-jobs create --config`.
# CONTENT[1:] drops the first character of the spec text (its leading newline).
# The context manager closes the file even if the write raises.
with open("custom_job_spec.yaml", "w") as spec_file:
    spec_file.write(CONTENT[1:])

# Print the generated spec so it can be reviewed before submitting the job.
!cat custom_job_spec.yaml

## 3-3. Create CPU test job on Vertex AI Training - us-central1

In [None]:
#----------- Create CPU Test Job -----------#
# Submits the job spec to Vertex AI Training in REGION_CENTRAL.
# --enable-web-access turns on the interactive shell for the job, and the
# network_type label is what the job listing in section 4 filters on.
!gcloud beta ai custom-jobs create \
    --project={PROJECT_ID} \
    --region={REGION_CENTRAL} \
    --display-name="CPU Test Job Peering" \
    --config=custom_job_spec.yaml \
    --enable-web-access \
    --labels network_type=vpc_peering

## 3-4. Create CPU test job on Vertex AI Training - us-west1

In [None]:
#----------- Create CPU Test Job -----------#
# Submits the same job spec (same image, service account and network) to
# Vertex AI Training in REGION_WEST; only --region differs from the job above.
!gcloud beta ai custom-jobs create \
    --project={PROJECT_ID} \
    --region={REGION_WEST} \
    --display-name="CPU Test Job Peering" \
    --config=custom_job_spec.yaml \
    --enable-web-access \
    --labels network_type=vpc_peering

## 3-5. Monitor and debug training with an interactive shell

The jobs in this project have [enabled interactive shells](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell) for the custom training resource. The interactive shell allows you to inspect the container where your training code is running.

You can navigate to the interactive shell with [these](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell#navigate-console) instructions.

# 4-0. Get Job Details

In [None]:
# Option 1: Use the Custom Job ID to get details
# NOTE(review): this uses REGION; for the job submitted to REGION_WEST the
# region presumably has to be changed to match — confirm.
# JOB_ID = "" # @param {type:"string"}
# !gcloud beta ai custom-jobs describe {JOB_ID} --project={PROJECT_ID} --region={REGION}

In [None]:
# Option 2: List existing custom jobs, filter running jobs and ones with the set label
# Lists the existing custom jobs, filters with the label set for these jobs
# The filter keeps jobs that are not in a SUCCEEDED, FAILED or CANCELLED state and
# that carry the network_type=vpc_peering label. The outer double quotes are
# part of the Python string so that the shell receives the filter as one argument.
FILTER = '"(state!="JOB_STATE_SUCCEEDED" AND state!="JOB_STATE_FAILED" AND state!="JOB_STATE_CANCELLED") AND labels.network_type=vpc_peering"'
!gcloud beta ai custom-jobs list --project={PROJECT_ID} --region={REGION_CENTRAL} --filter={FILTER}

In [None]:
# Same listing for REGION_WEST; relies on FILTER defined in the previous cell.
!gcloud beta ai custom-jobs list --project={PROJECT_ID} --region={REGION_WEST} --filter={FILTER}

# 5-0. Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- Artifacts Repository
- VPC Network

In [None]:
# Set this to True only if you'd like to delete the Artifact Registry repository
# (the bucket flag below is unused; no bucket is created in this example)
# delete_bucket = False
delete_artifacts_repo = False

# No bucket used in this example
# if delete_bucket:
#   !gsutil rm -rf $JOB_DIR
#   !gsutil rm -r $BUCKET_URI

# Deletes the repository together with the images stored in it; --quiet skips
# the confirmation prompt. The VPC resources created in section 2 are not
# removed by this cell.
if delete_artifacts_repo:
  !gcloud artifacts repositories delete {PRIVATE_REPO} --project={PROJECT_ID} --location={REGION} --quiet