In [None]:
!pip install -Uq huggingface-hub

In [None]:
# Download the model snapshot from the Hugging Face Hub and copy it to Cloud Storage,
# which is used as the model repository. A tuned model is expected to have been
# uploaded to Cloud Storage already.

import glob
import os
import shutil
import subprocess

from huggingface_hub import snapshot_download

# Read the token from the environment instead of hard-coding it in the notebook.
# An empty value means "no explicit token" and is passed to the Hub client as None.
hf_token = os.environ.get("HF_TOKEN", "")
hf_repo_id = "Qwen/Qwen2.5-VL-3B-Instruct"
BASE_ARTIFACT_URI = "gs://qwiklabs-gcp-02-a53829e24ea1-model"
LOCAL_MODEL_DIR = "/tmp/model"

# Start from a clean directory so files left by an earlier download are not uploaded.
shutil.rmtree(LOCAL_MODEL_DIR, ignore_errors=True)

print("Start downloading")
snapshot_download(repo_id=hf_repo_id, token=hf_token or None, local_dir=LOCAL_MODEL_DIR)

print("Uploading")
# "*" (unlike the previous "*.*") also picks up extension-less files and
# sub-directories; like the shell glob it still skips hidden dot-entries.
model_files = sorted(glob.glob(os.path.join(LOCAL_MODEL_DIR, "*")))
if not model_files:
    raise RuntimeError(f"No files were downloaded to {LOCAL_MODEL_DIR}")

# check=True raises CalledProcessError when the copy fails, so "Done" is only
# printed after a successful upload. The trailing slash makes the destination
# a prefix even if only a single file is copied.
subprocess.run(
    [
        "gcloud", "storage", "cp", "--recursive",
        *model_files,
        f"{BASE_ARTIFACT_URI}/{hf_repo_id}/",
    ],
    check=True,
)
print("Done")

In [None]:
# Shared settings: logging, Vertex AI region, serving image and deployment identity
import json
import logging
import os

import urllib3

logging.basicConfig(level=logging.INFO)
# Suppress urllib3's InsecureRequestWarning for the rest of the session.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

LOCATION = "asia-northeast1"
DEPLOY_TIMEOUT = 60 * 60  # 3600; passed as deploy_request_timeout when deploying
# Service account used to access GCS
SERVICE_ACCOUNT = "qwiklabs-gcp-02-a53829e24ea1@qwiklabs-gcp-02-a53829e24ea1.iam.gserviceaccount.com"
# Serving container image used for the uploaded model
vLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20260130_0916_RC01"

In [None]:
# Model configuration (active: Qwen2.5-VL-3B-Instruct on a single GPU)
MODEL_NAME_V1 = "Qwen2.5-VL-3B-Instruct GPU"
MODEL_PATH_V1 = BASE_ARTIFACT_URI + "/" + hf_repo_id
# Last two path components of the artifact URI, i.e. "<org>/<model>".
MODEL_ID = "/".join(MODEL_PATH_V1.rsplit("/", 2)[-2:])
MAX_MODEL_LEN = 65536

# Hardware: the accelerator count follows the tensor-parallel size.
TENSOR_PARALLEL_SIZE = 1
ACCELERATOR_COUNT = TENSOR_PARALLEL_SIZE
ACCELERATOR_TYPE = "NVIDIA_L4"  # other options: NVIDIA_TESLA_A100, NVIDIA_H100_80GB
MACHINE_TYPE = "g2-standard-4"  # other options: a2-highgpu-4g, g2-standard-48, a3-edgegpu-8g
TPU_TOPOLOGY = None

# Alternative configuration (Llama-3.1-8B on A100), kept for reference:
#MODEL_NAME_V1 = "Llama-3.1-8B"
#MODEL_PATH_V1 = "gs://jk-model-repo/meta-llama/Llama-3.1-8B"
#TENSOR_PARALLEL_SIZE = 1
#MAX_MODEL_LEN = 32768
#MACHINE_TYPE = "a2-highgpu-1g"
#ACCELERATOR_TYPE = "NVIDIA_TESLA_A100"

In [None]:
# Upload the model package (serving container + artifacts) to the Vertex AI model registry.
# The container image is referenced directly rather than through a LocalModel.
from google.cloud import aiplatform

# Environment variables set on the serving container.
# Optional extras kept for reference:
#   "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1", "VLLM_USE_V1": "1", "HF_TOKEN": hf_token
vllm_container_env = {
    "MODEL_ID": MODEL_ID,
    "DEPLOY_SOURCE": "API_NATIVE_MODEL",
}

# Command line handed to the serving container; the order matches the original cell.
vllm_container_args = [
    "python",
    "-m",
    "vllm.entrypoints.api_server",
    "--port=8080",
    f"--tensor-parallel-size={TENSOR_PARALLEL_SIZE}",
    "--enable-prefix-caching",
    "--enable-chunked-prefill",
    f"--max-model-len={MAX_MODEL_LEN}",
    "--served-model-name=openapi",
    "--disable_chunked_mm_input",
    "--limit-mm-per-prompt.image=0",
    "--limit-mm-per-prompt.video=1",
    "--max-num-seqs=1",
    "--max-num-batched-tokens=64K",
    "--allowed-local-media-path=/gcs/jk-mount-test",
    f"--model={MODEL_ID}",
]
# Flags that were tried and are currently disabled:
#   f"--download-dir={BASE_PATH}"
#   f"--model={MODEL_PATH_V1}"
#   "--enable-request-id-headers"    (enable to track the id from the response header)
#   "--gpu-memory-utilization=0.9"
#   "--swap-space=16"
#   "--max-num-batched-tokens=512"
#   "--enforce-eager"                (reduces memory but is also slower)
#   "--max-num-seqs=128"             (temporary, for testing)

model_v1 = aiplatform.Model.upload(
    display_name=MODEL_NAME_V1,
    location=LOCATION,
    artifact_uri=MODEL_PATH_V1,
    # Not used here: local_model, parent_model=prev_model.resource_name,
    # is_default_version=True, serving_container_predict_route="/v1/chat/completions"
    serving_container_image_uri=vLLM_DOCKER_URI,
    serving_container_ports=[8080],
    serving_container_health_route="/health",
    serving_container_invoke_route_prefix="/*",
    serving_container_environment_variables=vllm_container_env,
    serving_container_args=vllm_container_args,
    serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
)

In [None]:
# Create the endpoint that the model is deployed to (dedicated endpoint enabled)
from google.cloud import aiplatform

endpoint_display_name = f"{MODEL_NAME_V1} public dedicated test endpoint"
endpoint = aiplatform.Endpoint.create(
    location=LOCATION,
    display_name=endpoint_display_name,
    dedicated_endpoint_enabled=True,
    # labels={"sample-key": "sample-value"},
)

In [None]:
# Deploy model_v1 to the public endpoint with dedicated network
deploy_options = dict(
    model=model_v1,
    service_account=SERVICE_ACCOUNT,
    # Fixed at exactly one replica
    min_replica_count=1,
    max_replica_count=1,
    # Hardware, taken from the model configuration cell
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    tpu_topology=TPU_TOPOLOGY,
    deploy_request_timeout=DEPLOY_TIMEOUT,
    # Traffic options that are not used here:
    #   traffic_percentage=50
    #   traffic_split={'a':50, 'b':50}
)
response = endpoint.deploy(**deploy_options)

In [None]:
#Sample test
from pydantic import BaseModel
from enum import Enum

# Allowed values for the `car_type` field of CarDescription.
# The `str` mixin makes every member a plain string as well as an Enum member.
# Documented with comments instead of a class docstring: a docstring may end up
# in the JSON schema generated from CarDescription — TODO confirm before adding one.
# NOTE(review): "sedan" is lower-case while the other values are capitalised;
# confirm this is intentional.
class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"

# Shape of the answer requested from the model in the sample request; the JSON
# schema produced by CarDescription.model_json_schema() is sent as "guided_json".
# Documented with comments instead of a class docstring so the generated schema
# is not affected — TODO confirm before adding one.
class CarDescription(BaseModel):
    # Manufacturer name
    brand: str
    # Model name
    model: str
    # One of the CarType values
    car_type: CarType

# Text-only chat request; the reply is requested in the CarDescription JSON schema.
car_question = {
    "role": "user",
    "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
}
car_schema = CarDescription.model_json_schema()

# NOTE(review): "guided_json" is the structured-output field name this notebook
# relies on; confirm it is still accepted by the vLLM version in the serving image.
prediction_input = {
    "stream": False,
    "chat_template_kwargs": {"enable_thinking": False},
    "messages": [car_question],
    "guided_json": car_schema,
}

In [None]:
# Send the structured-output sample request to the public dedicated endpoint
from google.cloud import aiplatform

# To reuse an already deployed endpoint instead of the one created in this session:
#ENDPOINT_ID = "4304072351789613056"
#endpoint = aiplatform.Endpoint(ENDPOINT_ID, location=LOCATION)
headers = {
    "Content-Type": "application/json",
    #"x-request-id": "ebb94475-1ca2-4e4b-baa3-8d039c0e616e", #works when --enable-request-id-headers option enabled
    "x-vertex-ai-timeout-ms": "60000"
}
response = endpoint.invoke(
    request_path="/v1/chat/completions",
    headers=headers,
    body=json.dumps(prediction_input, indent=2).encode('utf-8'),
)
print(response.headers)
print(response)

# Surface HTTP failures with the server's own message instead of letting the
# JSON lookup below fail with an unrelated KeyError / decode error.
if not response.ok:
    print(response.text)
response.raise_for_status()

print(response.json()['choices'][0]['message']['content'])

In [None]:
# Ask the model to describe a video that is read from a local path on the server.
# The path lies under the directory passed as --allowed-local-media-path at upload time.
# NOTE(review): confirm the serving container accepts a bare path here; some
# servers expect a file:// URL for local media.
payload = {
    "stream": False,
    "chat_template_kwargs": {"enable_thinking": False},
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "text", 
                    "text": "Describe the video content in detail."
                },
                {
                    "type": "video_url",
                    "video_url": {"url": "/gcs/jk-mount-test/free-videos.mp4"}
                    #"video_url": {"url": "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"}
                },
            ]
        }
    ]
}
# `headers` is the request header dict defined in the previous predict cell.
response = endpoint.invoke(
    request_path="/v1/chat/completions",
    headers=headers,
    body=json.dumps(payload, indent=2).encode('utf-8'),
)
print(response.headers)
print(response)

# Surface HTTP failures with the server's own message instead of letting the
# JSON lookup below fail with an unrelated KeyError / decode error.
if not response.ok:
    print(response.text)
response.raise_for_status()

print(response.json()['choices'][0]['message']['content'])