In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Training DNS Peering Testing

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/jbrache/vertex-ai-things/blob/main/MLOps/Training/DNS_Peering/Vertex_Training_DNS_Peering_Testing.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2Fjbrache%2Fvertex-ai-things%2Fmain%2FMLOps%2FTraining%2FDNS_Peering%2FVertex_Training_DNS_Peering_Testing.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/jbrache/vertex-ai-things/main/MLOps/Training/DNS_Peering/Vertex_Training_DNS_Peering_Testing.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/jbrache/vertex-ai-things/blob/main/MLOps/Training/DNS_Peering/Vertex_Training_DNS_Peering_Testing.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

| Author(s) |
| --- |
| [Jose Brache](https://github.com/jbrache) |

## Overview

You can configure [Vertex AI to peer with Virtual Private Cloud](https://cloud.google.com/vertex-ai/docs/general/vpc-peering) (VPC) to connect directly with certain resources in Vertex AI.

This guide shows how to test [**DNS Peering**](https://cloud.google.com/blog/products/networking/how-to-use-cloud-dns-peering-in-a-shared-vpc-environment) with Vertex AI Training. A pre-requisite is setting up VPC Network Peering in a VPC Host Project to peer your network with Vertex AI Training. This guide is recommended for networking administrators who are already familiar with Google Cloud networking concepts.

Review the public docs for the latest information on support:
- [Set up VPC Network Peering for certain Vertex AI resources](https://cloud.google.com/vertex-ai/docs/general/vpc-peering)
- [Set up Connectivity from Vertex AI to Other Networks](https://cloud.google.com/vertex-ai/docs/general/hybrid-connectivity)

This example covers the following steps:
1. Pre-Requisite: Set up VPC Network Peering with Vertex AI Training, see [this](https://github.com/jbrache/vertex-ai-things/blob/main/MLOps/Training/Vertex_Training_with_VPC_Network_Peering.ipynb) sample.
2. Set up a project as the **VPC Host Project (Project 'a')**
3. Set up VPC Peering in the VPC Host Project for Vertex AI Training
4. Prepare Training Job
5. Submit Training Job with VPC Network Peering in **us-central1**
6. Submit Training Job with VPC Network Peering in **us-west1**
7. Clean Up

## Get started

### Install Vertex AI SDK and other required packages

In [7]:
# Optional (Colab only): uncomment this cell to upgrade the Vertex AI SDK and
# then restart the kernel so the new package is picked up.
# It is left disabled; the cells visible in this notebook call the gcloud CLI
# and do not import the SDK.

# import sys

# if "google.colab" in sys.modules:
#     !pip3 install --upgrade google-cloud-aiplatform

#     # Restart the notebook kernel after installs.
#     import IPython

#     app = IPython.Application.instance()
#     app.kernel.do_shutdown(True)

---

#### ⚠️ Do not forget to click the "RESTART RUNTIME" button above.

---

If you install additional packages, it's suggested to restart the notebook kernel so it can find the packages.

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [8]:
import sys

# Interactive sign-in is only needed (and only available) on Google Colab;
# in any other environment this cell does nothing.
running_on_colab = "google.colab" in sys.modules
if running_on_colab:
    from google.colab import auth

    auth.authenticate_user()

# 0-0. Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [9]:
############## Set Region for Shared Resources ###########################
# Region used for shared resources (bucket, Artifact Registry repository).
REGION = "us-central1"  # @param {type: "string"}

############## Set VPC Host Project ###########################
# Calling this Project 'a' where the VPC Host project exists
PROJECT_ID_VPC_HOST = "ds-dev-jb02-psci"  # @param {type:"string"}

# VPC network the training jobs attach to; shared by both regions below.
VPC_NAME = "vertex-vpc-prod"

# Region #1
REGION_CENTRAL = "us-central1"  # @param {type: "string"}
SUBNET_NAME_CENTRAL = f"{REGION_CENTRAL}-vertex-psci"
NETWORK_ATTACHMENT_NAME_CENTRAL = f"{REGION_CENTRAL}-vertex-psci"

# Region #2
REGION_WEST = "us-west1"  # @param {type: "string"}
SUBNET_NAME_WEST = f"{REGION_WEST}-vertex-psci"
NETWORK_ATTACHMENT_NAME_WEST = f"{REGION_WEST}-vertex-psci"

# DNS Peering Settings
# Separate VPC that hosts the private DNS zone. The gcloud cells below
# interpolate DNS_VPC_NAME, so that is the name that must be defined here.
DNS_VPC_NAME = "vertex-dns-vpc"
# Alias kept so anything still referring to the shorter name keeps working.
DNS_VPC = DNS_VPC_NAME
# Domain served by the private zone and shared with Vertex AI via DNS peering.
DNS_SUFFIX = "myfakezone.com"

############## Set Vertex AI Project ###########################
# Calling this Project 'b' where Vertex AI Training jobs run
PROJECT_ID = "ds-dev-jb02-psci"  # @param {type:"string"}

############## Set Bucket ###########################
BUCKET_URI = f"gs://{PROJECT_ID}-artifacts"

# Set the project id
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


## 0-1. Enable APIs
The following APIs are enabled in this demo:
1. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)
2. [Enable the Cloud Build API](https://console.cloud.google.com/flows/enableapi?apiid=cloudbuild.googleapis.com)
3. [Enable the Artifact Registry API](https://console.cloud.google.com/flows/enableapi?apiid=artifactregistry.googleapis.com): You must enable the Artifact Registry API for your project. You will store your custom training container in Artifact Registry. [Learn more about Enabling the Artifact Registry service](https://cloud.google.com/artifact-registry/docs/enable-service)
4. [Enable the Service Networking API](https://console.cloud.google.com/flows/enableapi?apiid=servicenetworking.googleapis.com)


In [None]:
############# Enable the APIs for Vertex AI Project ########################
!gcloud services enable --project=$PROJECT_ID aiplatform.googleapis.com artifactregistry.googleapis.com cloudbuild.googleapis.com --project $PROJECT_ID

In [None]:
############# Enable the APIs for VPC Host Project ########################
!gcloud services enable --project=$PROJECT_ID_VPC_HOST aiplatform.googleapis.com compute.googleapis.com servicenetworking.googleapis.com --project $PROJECT_ID_VPC_HOST

## 0-2. Import libraries

In [None]:
import os
import sys
import requests
import json

## 0-3. Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
create_bucket = False
if create_bucket:
    !gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

# 2-0. Set up Separate VPC for DNS Peering Testing

Following this guide for [setting up a Private Service Connect interface for Vertex AI resources](https://cloud.google.com/vertex-ai/docs/general/vpc-psc-i-setup).

## 2-1. Create VPC Network

In [None]:
!gcloud compute networks create {DNS_VPC_NAME} \
    --project={PROJECT_ID_VPC_HOST} \
    --subnet-mode=custom

## 2-2. Create Private Zone

In [None]:
!gcloud dns managed-zones create my-private-zone \
    --project={PROJECT_ID_VPC_HOST} \
    --description="" \
    --dns-name={DNS_SUFFIX} \
    --visibility=private \
    --networks={DNS_VPC_NAME}

## 2-3. Create DNS Record

In [None]:
!gcloud dns record-sets create test.{DNS_SUFFIX} \
    --project={PROJECT_ID_VPC_HOST} \
    --zone=my-private-zone \
    --type=A \
    --ttl=300 \
    --rrdatas=1.2.3.4

## 2-4. Create DNS Peering Zone

In [None]:
!gcloud dns managed-zones create \
    peer-mydns-vpc-from-vertex-vpc \
    --project={PROJECT_ID_VPC_HOST} \
    --dns-name={DNS_SUFFIX} \
    --visibility="private" \
    --description="" \
    --networks={VPC_NAME} \
    --target-project={PROJECT_ID_VPC_HOST} \
    --target-network={DNS_VPC_NAME}


## 2-5. Create a Peered DNS Domain

In [None]:
!gcloud services peered-dns-domains create \
    --project={PROJECT_ID_VPC_HOST} \
    vertex-dns \
    --network={PROJECT_ID_VPC_HOST} \
    --dns-suffix={DNS_SUFFIX}


# 3-0. Run custom training jobs on the Peered VPC

Pre-Requisite: Setup VPC Network Peering with Vertex AI Training, see [this](https://github.com/jbrache/vertex-ai-things/blob/main/MLOps/Training/Vertex_Training_with_VPC_Network_Peering.ipynb) sample.

## 3-1. Prepare training jobs

Vertex AI Training supports submitting custom training jobs with a prebuilt container, custom container and python application via **HTTP request, Vertex AI SDK or gcloud CLI**. Learn more [here](https://cloud.google.com/vertex-ai/docs/training/code-requirements).

In this example, we demonstrate how to run a custom job with a custom container. Set the image below to your own custom image.
Note, if it's not a public image, please ensure it's already pushed to your project.

https://cloud.google.com/vertex-ai/docs/training/containers-overview

In [None]:
# Artifact Registry repository to create / use for the training job image.
PRIVATE_REPO = "test-training"

# Image URI layout: <region>-docker.pkg.dev/<project>/<repo>/test:latest
TRAIN_IMAGE = "{}-docker.pkg.dev/{}/{}/test:latest".format(
    REGION, PROJECT_ID, PRIVATE_REPO
)

print("Private Repo:", PRIVATE_REPO)
print("Training Container Image:", TRAIN_IMAGE)

In [None]:
PROJECT_NUMBER=!(gcloud projects describe $PROJECT_ID --format="value(projectNumber)")
PROJECT_NUMBER = PROJECT_NUMBER[0]

In [None]:
PROJECT_NUMBER_VPC_HOST=!(gcloud projects describe $PROJECT_ID_VPC_HOST --format="value(projectNumber)")
PROJECT_NUMBER_VPC_HOST = PROJECT_NUMBER_VPC_HOST[0]

In [None]:
# Full resource path of the VPC, built from the host project's number.
NETWORK_NAME = "projects/{}/global/networks/{}".format(
    PROJECT_NUMBER_VPC_HOST, VPC_NAME
)
BUCKET_URI = "gs://{}-artifacts".format(PROJECT_ID)

print(NETWORK_NAME, BUCKET_URI, sep="\n")

In [None]:
# Prepare training images
CPU_IMAGE = TRAIN_IMAGE
JOB_DIR = BUCKET_URI
print("Using image: ", CPU_IMAGE)
print("Output Model Directory: ", JOB_DIR)
print("Network Name: ", NETWORK_NAME)

## 3-2. Training Job Config

In [None]:
# Machine type used for the single worker replica.
CPU_MACHINE_TYPE = "n2-standard-4"  # @param {type:"string"}
# Service account the job runs as, derived from the Vertex AI project's number.
TRAINING_SA = "{}-compute@developer.gserviceaccount.com".format(PROJECT_NUMBER)

In [None]:
# ----------- Set WorkerPool Spec -----------#
# https://cloud.google.com/sdk/gcloud/reference/beta/ai/custom-jobs/create
# YAML passed to `gcloud ai custom-jobs create --config`.
# `args` is a list field, so each argument is written as its own "- " item.
# The string deliberately starts with a newline (for readability); the cell
# that writes the file drops that first character.
CONTENT = f"""
workerPoolSpecs:
  - machineSpec:
      machineType: {CPU_MACHINE_TYPE}
    replicaCount: 1
    containerSpec:
      imageUri: {CPU_IMAGE}
      args:
        - --sleep=1200s
serviceAccount: {TRAINING_SA}
network: {NETWORK_NAME}
"""

In [None]:
f = open("custom_job_spec.yaml", "w")
f.write(CONTENT[1:])
f.close()

!cat custom_job_spec.yaml

## 3-3. Create CPU test job on Vertex AI Training - us-central1

In [None]:
#----------- Create CPU Test Job -----------#
!gcloud beta ai custom-jobs create \
    --project={PROJECT_ID} \
    --region={REGION_CENTRAL} \
    --display-name="CPU Test Job Peering" \
    --config=custom_job_spec.yaml \
    --enable-web-access \
    --labels network_type=vpc_peering

## 3-4. Create CPU test job on Vertex AI Training - us-west1

In [None]:
#----------- Create CPU Test Job -----------#
!gcloud beta ai custom-jobs create \
    --project={PROJECT_ID} \
    --region={REGION_WEST} \
    --display-name="CPU Test Job Peering" \
    --config=custom_job_spec.yaml \
    --enable-web-access \
    --labels network_type=vpc_peering

## 3-5. Monitor and debug training with an interactive shell

The jobs in this project have [enabled interactive shells](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell) for the custom training resource. The interactive shell allows you to inspect the container where your training code is running.

You can navigate to the interactive shell with [these](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell#navigate-console) instructions.

In [None]:
# In the interactive shell run:
# dig myfakezone.com
#
# The "Create DNS Record" step adds an A record for test.<DNS_SUFFIX> pointing
# at 1.2.3.4, so also query that name to check resolution through DNS peering:
# dig test.myfakezone.com

# 4-0. Get Job Details

In [None]:
# Option 1: Use the Custom Job ID to get details
# JOB_ID = "" # @param {type:"string"}
# !gcloud beta ai custom-jobs describe {JOB_ID} --project={PROJECT_ID} --region={REGION}

In [None]:
# Option 2: list custom jobs that have not reached a terminal state
# (succeeded / failed / cancelled) and carry the label set at creation.
# The outer double quotes are part of the value: it is pasted verbatim into
# the shell command line by the listing cells.
FILTER = (
    '"(state!="JOB_STATE_SUCCEEDED" AND state!="JOB_STATE_FAILED" '
    'AND state!="JOB_STATE_CANCELLED") AND labels.network_type=vpc_peering"'
)
!gcloud beta ai custom-jobs list --project={PROJECT_ID} --region={REGION_CENTRAL} --filter={FILTER}

In [None]:
!gcloud beta ai custom-jobs list --project={PROJECT_ID} --region={REGION_WEST} --filter={FILTER}

# 5-0. Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- Cloud Storage Bucket
- Artifacts Repository
- VPC Network

In [None]:
# Set this to true only if you'd like to delete your bucket
delete_bucket = False
delete_artifacts_repo = False

if delete_bucket:
  !gsutil rm -rf $JOB_DIR
  !gsutil rm -r $BUCKET_URI

if delete_artifacts_repo:
  !gcloud artifacts repositories delete {PRIVATE_REPO} --project={PROJECT_ID} --location={REGION} --quiet