In [None]:
import glob
import os
import shutil
import subprocess

from huggingface_hub import snapshot_download

# Stage a Hugging Face model snapshot in a local directory, then copy its
# top-level files to Cloud Storage with `gcloud storage cp`.

# Prefer supplying the token through the HF_TOKEN environment variable so it is
# not stored in the notebook; the "TOKEN" placeholder is used only when unset.
hf_token = os.environ.get("HF_TOKEN", "TOKEN")
hf_repo_id = "meta-llama/Llama-3.1-8B-Instruct"
BASE_ARTIFACT_URI = "gs://BUCKET"
LOCAL_MODEL_DIR = "/tmp/model"

# Start from an empty staging directory; a missing directory is not an error.
shutil.rmtree(LOCAL_MODEL_DIR, ignore_errors=True)

print("Start downloading")
snapshot_download(repo_id=hf_repo_id, token=hf_token, local_dir=LOCAL_MODEL_DIR)

print("Uploading")
# Same selection as the shell pattern /tmp/model/*.* : top-level names that
# contain a dot and do not start with one. Directories are dropped because the
# copy below is not recursive.
model_files = sorted(
    path for path in glob.glob(f"{LOCAL_MODEL_DIR}/*.*") if os.path.isfile(path)
)
if not model_files:
    raise FileNotFoundError(f"No model files found in {LOCAL_MODEL_DIR}")

# check=True raises CalledProcessError when gcloud exits non-zero, so "Done" is
# printed only after a successful upload.
subprocess.run(
    ["gcloud", "storage", "cp", *model_files, f"{BASE_ARTIFACT_URI}/{hf_repo_id}"],
    check=True,
)
print("Done")

In [None]:
import logging
import os

# Configure root logging at INFO level.
logging.basicConfig(level=logging.INFO)

# ---------------------------------------------------------------------------
# Project settings: replace the placeholder values before running.
# ---------------------------------------------------------------------------
PROJECT_ID = "ID"
PROJECT_NUMBER = "NUMBER"
MODEL_PATH = "gs://MODEL_PATH"

# Add Pub/Sub publisher to this account for topic tgi
# (presumably "this account" is SERVICE_ACCOUNT - TODO confirm).
SERVICE_ACCOUNT = "SA"

# Image URI of the custom serving container, scoped to PROJECT_ID.
CUSTOM_CONTAINER = "us-central1-docker.pkg.dev/{}/custom-inference-gpu/tgi-gcp".format(PROJECT_ID)

# References:
#  - LocalModel API:
#    https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.prediction.LocalModel#google_cloud_aiplatform_prediction_LocalModel
#  - TGI launcher options (for serving_container_args):
#    https://huggingface.co/docs/text-generation-inference/en/reference/launcher
#  - TGI container entry point:
#    https://github.com/huggingface/Google-Cloud-Containers/blob/main/containers/tgi/gpu/2.4.0/entrypoint.sh
#  - Vertex AI AIP_* variables for custom containers:
#    https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements#aip-variables

In [None]:
#CONTAINER = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-4.ubuntu2204.py311"
CONTAINER = CUSTOM_CONTAINER
!docker build -t {CONTAINER} .
MODEL_NAME = "Llama-3.1-8B-Instruct"

#must secure sufficient space
from google.cloud.aiplatform.prediction import LocalModel
local_model = LocalModel(serving_container_image_uri=CONTAINER,
                         serving_container_environment_variables={
                             "VERTEX_CPR_MAX_WORKERS": "1",
                             "RUST_BACKTRACE": "full", #for stack trace printing,
                             #"PORT": "5000", #server runs on 5000, or 8080 by dafault
                         },
                         #serving_container_ports=[5000], #expose container port, system map is random
                         serving_container_health_route="/aiphealth", #need to add bypass conflict only for localmodel
                         serving_container_args=["--num-shard 1"], #We can use both serving_container_environment_variables and serving_container_args
                        )

#TGI container handles /generate as input handler
prediction_handler = "/generate"
prediction_input = {
    "inputs": "What is machine learning?",
    "parameters": {
        "max_new_tokens": 256,
        "repetition_penalty": 1.2
    }
}

In [None]:
# vLLM option: serve the model with the prebuilt vLLM inference container.
# Running this cell replaces CONTAINER, MODEL_NAME, local_model,
# prediction_handler and prediction_input with the vLLM variants.
from google.cloud.aiplatform.prediction import LocalModel

MODEL_NAME = "Llama-3.1-8B-Instruct-vllm-prebuilt-container"
CONTAINER = "us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/vllm-inference.cu121.0-5.ubuntu2204.py310"

# Command line used to start the vLLM API server inside the container.
vllm_server_args = [
    "python",
    "-m",
    "vllm.entrypoints.api_server",
    "--host=0.0.0.0",
    "--port=8080",
    "--gpu-memory-utilization=0.9",
    "--max-model-len=16384",
]

#must secure sufficient space
local_model = LocalModel(
    serving_container_image_uri=CONTAINER,
    serving_container_args=vllm_server_args,
    serving_container_ports=[8080],  # expose container port, system map is random
    serving_container_predict_route="/generate",
    serving_container_health_route="/ping",
)

#Default input handler
# NOTE(review): the local test cell assigns this value to
# local_endpoint.serving_container_predict_route, which overrides the
# "/generate" predict route configured above - confirm "/predict" is intended.
prediction_handler = "/predict"

generation_parameters = {
    "max_new_tokens": 256,
    "repetition_penalty": 1.2,
}
prediction_input = {
    "instances": [{"prompt": "What is machine learning?"}],
    "parameters": generation_parameters,
}

In [None]:
# Display the serving container spec of the LocalModel built above, to check
# the configured image, routes, ports, args and environment before deploying.
local_model.get_serving_container_spec()

In [None]:
#Run container locally to check base output
#docker run -it -p 5000:5000 -e AIP_HTTP_PORT=5000 -e AIP_STORAGE_URI=gs://jk-model-repo/gemma1/gemma-2b-it-test --gpus=all us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-4.ubuntu2204.py311

In [None]:
#Test plain endpoint
import json
import time

# Run the serving container as a local endpoint, send one health check and one
# prediction request, then print the container logs.
with local_model.deploy_to_local_endpoint(
    artifact_uri=MODEL_PATH,
    gpu_count=-1,  # presumably "all available GPUs" - TODO confirm against the SDK docs
    container_ready_timeout=600,
) as local_endpoint:
    try:
        health_check_response = local_endpoint.run_health_check()
        # Send the request to the route selected in the model cell.
        local_endpoint.serving_container_predict_route = prediction_handler
        predict_response = local_endpoint.predict(
            request=json.dumps(prediction_input),
            headers={"Content-Type": "application/json"},
        )
    finally:
        # Print the logs even when the health check or prediction raises:
        # leaving the `with` block tears the endpoint down, after which the
        # logs can no longer be printed from it.
        # The pause presumably lets the container finish writing its output.
        time.sleep(30)
        local_endpoint.print_container_logs()
print(health_check_response, health_check_response.content)
print(predict_response.text)

In [None]:
#Run if needed
# Register gcloud as the Docker credential helper for us-central1-docker.pkg.dev
# (--quiet skips the confirmation prompt), then push the LocalModel's serving image.
!gcloud auth configure-docker us-central1-docker.pkg.dev --quiet
local_model.push_image()

In [None]:
from google.cloud import aiplatform

# Create a new Vertex AI endpoint named after the selected model.
endpoint_display_name = f"{MODEL_NAME} test endpoint"
endpoint_labels = {"sample-key": "sample-value"}
endpoint = aiplatform.Endpoint.create(
    labels=endpoint_labels,
    display_name=endpoint_display_name,
)

In [None]:
# Upload the model to Vertex AI: the serving container settings come from
# local_model and the model artifacts from MODEL_PATH.
upload_options = {
    "display_name": MODEL_NAME,
    "local_model": local_model,
    "artifact_uri": MODEL_PATH,
}

# Optional settings, currently disabled:
#   parent_model = prev_model.resource_name,
#   is_default_version=True,
#   serving_container_environment_variables={
#       "VERTEX_CPR_MAX_WORKERS": "1",
#       "PORT": "8080", #server runs on 5000, or 8080 by default
#       "RUST_BACKTRACE": "full", #for stack trace printing,
#   },
#   serving_container_ports=[8080],
#   serving_container_args = ["--num-shard 1"]
model = aiplatform.Model.upload(**upload_options)

In [None]:
# Deploy the uploaded model to the endpoint on a single fixed replica.
# Alternative hardware (disabled):
#   machine_type="a2-highgpu-1g", accelerator_type="NVIDIA_TESLA_A100"
hardware_options = {
    "machine_type": "g2-standard-4",
    "accelerator_type": "NVIDIA_L4",
    "accelerator_count": 1,
}
# min == max, so the replica count stays at one.
replica_options = {
    "min_replica_count": 1,
    "max_replica_count": 1,
}
# Traffic options (disabled):
#   traffic_percentage=50
#   traffic_split={'a':50, 'b':50}
endpoint.deploy(
    model=model,
    service_account=SERVICE_ACCOUNT,
    **hardware_options,
    **replica_options,
)

In [None]:
from google.cloud import aiplatform

# Run this cell when starting from an endpoint that already exists, instead of
# creating a new one: set ENDPOINT_ID to that endpoint's ID.
ENDPOINT_ID = "00000000000"

# Handle to the existing endpoint, used by the prediction cells.
endpoint = aiplatform.Endpoint(ENDPOINT_ID)

In [None]:
# Send one prediction request through the SDK and print the first prediction.
# NOTE(review): the prompt uses <start_of_turn>/<end_of_turn> chat markup while
# MODEL_NAME refers to a Llama model - confirm it matches the deployed model.
sampling_parameters = {
    "max_new_tokens": 128,
    "do_sample": True,
    "top_p": 0.95,
    "temperature": 0.7,
}
prompt_instance = {
    "inputs": "<bos><start_of_turn>user\nWhat's Deep Learning?<end_of_turn>\n<start_of_turn>model\n",
    "parameters": sampling_parameters,
}

output = endpoint.predict(instances=[prompt_instance])
first_prediction = output.predictions[0]
print(first_prediction)

In [None]:
#https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/call-vertex-using-openai-library
#https://cloud.google.com/deep-learning-containers/docs/choosing-container#vLLM-inference

In [None]:
import google.auth
# Import the transport submodule explicitly: `import google.auth` does not
# guarantee that google.auth.transport.requests is loaded, so relying on it
# only works when some other library happened to import it first.
import google.auth.transport.requests
import requests

# Call the endpoint's REST :predict method with an OAuth bearer token taken
# from Application Default Credentials.
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)  # populates creds.token
url = f"https://us-central1-aiplatform.googleapis.com/v1/projects/{PROJECT_NUMBER}/locations/us-central1/endpoints/{ENDPOINT_ID}:predict"
headers = {'Authorization': f'Bearer {creds.token}'}
payload = {
    "instances": [
        {
            "inputs": "Hello!",
            "parameters": {
                "max_new_tokens": 128
            }
        }
    ]
}
# Decode the JSON body as-is so an error payload from the service is displayed too.
response = requests.post(url, json=payload, headers=headers).json()
response

In [None]:
#TODO: Chat and streaming failure needs to be checked on TGI container, vllm container worked
import google.auth
# Import the transport submodule explicitly: `import google.auth` does not
# guarantee that google.auth.transport.requests is loaded.
import google.auth.transport.requests
import requests

# Call the endpoint's chat/completions route with streaming enabled and print
# the response line by line as it arrives.
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)  # populates creds.token
url = f"https://us-central1-aiplatform.googleapis.com/v1/projects/{PROJECT_NUMBER}/locations/us-central1/endpoints/{ENDPOINT_ID}/chat/completions"
headers = {'Authorization': f'Bearer {creds.token}'}
payload = {
    "stream": True,
    "messages": [
        {
            "role": "user",
            "content": "Write a story about a magic backpack."
        }
    ]
}

# With "stream": True the body is expected to arrive incrementally rather than
# as one JSON document, so it is read line by line instead of with `.json()`.
# Reading raw lines also shows a plain JSON error body if the server sends one.
response_lines = []
with requests.post(url, json=payload, headers=headers, stream=True) as response:
    for raw_line in response.iter_lines():
        if not raw_line:
            continue  # skip blank keep-alive / separator lines
        line = raw_line.decode("utf-8")
        response_lines.append(line)
        print(line)