In [None]:
!pip install huggingface-hub

In [None]:
# Download the model from Hugging Face and stage it in private Cloud Storage,
# which this notebook uses as the model repository. A tuned model should
# already have been uploaded to Cloud Storage.

import glob
import os
import shutil
import subprocess

from huggingface_hub import snapshot_download

# Read the token from the environment so it is never saved in the notebook.
# Falls back to "" (the previous hardcoded value) when HF_TOKEN is not set.
hf_token = os.environ.get("HF_TOKEN", "")
hf_repo_id = "google/gemma-3-4b-it"
BASE_ARTIFACT_URI = "gs://qwiklabs-asl-01-e0b9ada3eb75-model"

local_model_dir = "/tmp/model"

# Start from a clean directory so files left by an earlier run are not uploaded.
shutil.rmtree(local_model_dir, ignore_errors=True)

print("Start downloading")
# An empty token is passed as None so huggingface_hub applies its own default
# token lookup instead of sending an empty credential.
snapshot_download(repo_id=hf_repo_id, token=hf_token or None, local_dir=local_model_dir)

print("Uploading")
# Same selection as the shell pattern /tmp/model/*.* used previously:
# top-level, non-hidden entries whose name contains a dot.
model_files = sorted(glob.glob(os.path.join(local_model_dir, "*.*")))
if not model_files:
    raise FileNotFoundError(f"No files matching *.* found in {local_model_dir}")

# check=True raises CalledProcessError when gcloud fails, so "Done" is printed
# only after a successful upload. The trailing slash makes the destination a
# folder even when a single file is copied.
subprocess.run(
    ["gcloud", "storage", "cp", *model_files, f"{BASE_ARTIFACT_URI}/{hf_repo_id}/"],
    check=True,
)
print("Done")

In [None]:
# Shared settings: imports, logging, and deployment-wide constants used by the
# cells below.
import json
import logging
import os

import urllib3

# INFO-level logging for the whole notebook.
logging.basicConfig(level=logging.INFO)
# Suppress urllib3's InsecureRequestWarning messages.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Region for the resources created below.
# See https://docs.cloud.google.com/vertex-ai/docs/general/locations#region_considerations
LOCATION = "europe-west4"

# Serving container image passed to Model.upload.
vLLM_DOCKER_URI = "us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/vllm-inference-tpu.0-11.ubuntu2204.py312:model-garden.vllm-tpu-release_20260120.00_p0"

# Value passed as deploy_request_timeout when deploying (3600).
DEPLOY_TIMEOUT = 60 * 60

# Service account attached to the deployment; it needs access to the GCS bucket.
SERVICE_ACCOUNT = "qwiklabs-asl-01-e0b9ada3eb75@qwiklabs-asl-01-e0b9ada3eb75.iam.gserviceaccount.com"

In [None]:
# Model-specific settings: display name, artifact location, and serving shape.
# Depends on BASE_ARTIFACT_URI and hf_repo_id from the download cell.
MODEL_NAME_V1 = "gemma-3-4b-it TPU"
MODEL_PATH_V1 = f"{BASE_ARTIFACT_URI}/{hf_repo_id}"

# MODEL_ID is the last two "/"-separated segments of the artifact path.
model_path_segments = MODEL_PATH_V1.split("/")
MODEL_ID = "/".join(model_path_segments[-2:])

MAX_MODEL_LEN = 48 * 1024  # 49152
TENSOR_PARALLEL_SIZE = 1

# The machine type name embeds TENSOR_PARALLEL_SIZE.
# Other machine types noted for reference: ct6e-standard-8t, ct5lp-hightpu-8t
MACHINE_TYPE = f"ct6e-standard-{TENSOR_PARALLEL_SIZE}t"

# Leave as None to host on a single machine; other values noted: 2x2, 2x4
TPU_TOPOLOGY = None

In [None]:
# Upload the model package to the Vertex AI model repository, pairing the
# Cloud Storage artifacts with the vLLM serving container.
from google.cloud import aiplatform

# Environment variables for the serving container.
# Optional entries kept for reference:
#   "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1", "VLLM_USE_V1": "1", "HF_TOKEN": hf_token
serving_env = {
    "MODEL_ID": MODEL_ID,
    "DEPLOY_SOURCE": "API_NATIVE_MODEL",
}

# Command line handed to the container: the vLLM OpenAI-compatible API server.
# Optional flags kept for reference:
#   "--enable-request-id-headers"  (enable to track the id from the response header)
#   "--max-num-seqs=1" / "--max-num-seqs=128"
vllm_server_args = [
    "python",
    "-m",
    "vllm.entrypoints.openai.api_server",
    "--port=8080",
    f"--tensor-parallel-size={TENSOR_PARALLEL_SIZE}",
    "--enable-prefix-caching",
    "--enable-chunked-prefill",
    f"--max-model-len={MAX_MODEL_LEN}",
    "--served-model-name=openapi",
    "--disable_chunked_mm_input",
    "--limit-mm-per-prompt.image=1",
    "--limit-mm-per-prompt.video=0",
    "--max-num-batched-tokens=48K",
    f"--model={MODEL_ID}",
]

# The container is specified directly here rather than through a LocalModel.
# Options not used (parent_model, is_default_version, serving_container_predict_route)
# can be added when uploading a new version or a single predict route.
model_v1 = aiplatform.Model.upload(
    display_name=MODEL_NAME_V1,
    location=LOCATION,
    artifact_uri=MODEL_PATH_V1,
    serving_container_image_uri=vLLM_DOCKER_URI,
    serving_container_args=vllm_server_args,
    serving_container_environment_variables=serving_env,
    serving_container_ports=[8080],
    serving_container_health_route="/health",
    serving_container_invoke_route_prefix="/*",
    serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
)

In [None]:
# Create the endpoint that will host the model, with the dedicated-endpoint
# option enabled.
from google.cloud import aiplatform

endpoint_display_name = f"{MODEL_NAME_V1} public dedicated test endpoint"

endpoint = aiplatform.Endpoint.create(
    location=LOCATION,
    display_name=endpoint_display_name,
    dedicated_endpoint_enabled=True,
    # labels={"sample-key": "sample-value"},
)

In [None]:
# Deploy model_v1 to the endpoint with exactly one replica (min == max == 1).
# Alternatives kept for reference:
#   traffic_percentage=50 or traffic_split={'a':50, 'b':50}
#   GPU hosting: accelerator_type="NVIDIA_L4", machine_type="a2-highgpu-1g",
#                accelerator_type=ACCELERATOR_TYPE, accelerator_count=ACCELERATOR_COUNT
deploy_options = {
    "model": model_v1,
    "machine_type": MACHINE_TYPE,
    "tpu_topology": TPU_TOPOLOGY,
    "min_replica_count": 1,
    "max_replica_count": 1,
    "service_account": SERVICE_ACCOUNT,
    "deploy_request_timeout": DEPLOY_TIMEOUT,
}
endpoint.deploy(**deploy_options)

In [None]:
# Sample request: ask the model for a JSON object that follows a pydantic schema.
from enum import Enum

from pydantic import BaseModel


# Allowed values for CarDescription.car_type.
# (Comments are used instead of docstrings so the generated JSON schema is unchanged.)
class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"


# Shape of the JSON object the model is asked to produce.
class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType


car_prompt_message = {
    "role": "user",
    "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's"
}
car_json_schema = CarDescription.model_json_schema()

# Non-streaming chat request whose output is constrained by the schema above.
prediction_input = {
    "stream": False,
    "chat_template_kwargs": {"enable_thinking": False},
    "messages": [car_prompt_message],
    "structured_outputs": {"json": car_json_schema},
}

In [None]:
# Public dedicated endpoint predict: send the structured-output sample request
# to the chat-completions route.
# Relies on `endpoint`, `prediction_input`, and `json` from earlier cells.
from google.cloud import aiplatform
# To reuse an already-deployed endpoint instead of the one created above:
#ENDPOINT_ID = "4304072351789613056"
#endpoint = aiplatform.Endpoint(ENDPOINT_ID, location=LOCATION)
headers = {
    "Content-Type": "application/json",
    #"x-request-id": "ebb94475-1ca2-4e4b-baa3-8d039c0e616e", #works when --enable-request-id-headers option enabled
    "x-vertex-ai-timeout-ms": "60000"  # presumably a per-request timeout in milliseconds; confirm against the Vertex AI docs
}
response = endpoint.invoke(request_path="/v1/chat/completions", headers=headers, body=json.dumps(prediction_input, indent=2).encode('utf-8'))
print(response.headers)
print(response)
# Stop here with the HTTP status on an error response; otherwise the lookup
# below fails with a misleading KeyError or JSON decode error.
response.raise_for_status()
print(response.json()['choices'][0]['message']['content'])

In [None]:
# Multimodal sample: one image URL plus a text question in a single user message.
# Relies on `endpoint`, `headers`, and `json` from earlier cells.
payload = {
    "stream": False,
    "chat_template_kwargs": {"enable_thinking": False},
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"}
                },
                {
                    "type": "text", 
                    "text": "What animal is on the candy?"
                },
            ]
        }
    ]
}
response = endpoint.invoke(request_path="/v1/chat/completions", headers=headers, body=json.dumps(payload, indent=2).encode('utf-8'))
print(response.headers)
print(response)
# Stop here with the HTTP status on an error response; otherwise the lookup
# below fails with a misleading KeyError or JSON decode error.
response.raise_for_status()
print(response.json()['choices'][0]['message']['content'])