In [None]:
import os
import shutil
import subprocess

from huggingface_hub import snapshot_download

# Download the model snapshot from the Hugging Face Hub and copy it to GCS.
# The token is read from the HF_TOKEN environment variable when it is set, so
# the secret does not have to be saved in the notebook; the literal is only a
# placeholder fallback.
hf_token = os.environ.get("HF_TOKEN", "----")
hf_repo_id = "meta-llama/Llama-3.1-8B-Instruct"
BASE_ARTIFACT_URI = "gs://----"
LOCAL_MODEL_DIR = "/tmp/model"

# Start from a clean directory so leftovers of an earlier download are not uploaded.
shutil.rmtree(LOCAL_MODEL_DIR, ignore_errors=True)

print("Start downloading")
snapshot_download(repo_id=hf_repo_id, token=hf_token, local_dir=LOCAL_MODEL_DIR)

print("Uploading")
# shell=True is needed so the shell expands the *.* glob (only top-level names
# that contain a dot are matched, exactly as before).
# check=True raises CalledProcessError when gcloud exits non-zero, instead of
# silently continuing and printing "Done" after a failed upload.
subprocess.run(
    f"gcloud storage cp {LOCAL_MODEL_DIR}/*.* {BASE_ARTIFACT_URI}/{hf_repo_id}",
    shell=True,
    check=True,
)
print("Done")

In [None]:
# WARNING: destructive clean-up of the local Docker daemon.
# `docker ps -aq` / `docker images -aq` list the IDs of ALL containers / images,
# so this force-removes every container (with its anonymous volumes) and then
# every image on this machine, not only the ones created by this notebook.
!docker rm -vf $(docker ps -aq)
!docker rmi -f $(docker images -aq)

In [None]:
import logging
import os

# Emit INFO-level log records for the rest of the notebook.
logging.basicConfig(level=logging.INFO)

# Google Cloud settings (placeholders: fill in before running).
PROJECT_ID = "----"
PROJECT_NUMBER = "----"
VPC_NETWORK = "----"  # name of the VPC network to peer with
MODEL_PATH = "gs://----/meta-llama/Llama-3.1-8B-Instruct"

# Reference material:
# - LocalModel API:
#   https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.prediction.LocalModel#google_cloud_aiplatform_prediction_LocalModel
# - TGI launcher options usable as serving_container_args:
#   https://huggingface.co/docs/text-generation-inference/en/reference/launcher
# - Entry point of the Hugging Face TGI container:
#   https://github.com/huggingface/Google-Cloud-Containers/blob/main/containers/tgi/gpu/2.4.0/entrypoint.sh
# - AIP_* variables available to custom containers:
#   https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements#aip-variables

In [None]:
#To run TGI
#CONTAINER = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-4.ubuntu2204.py311"
CONTAINER = f"us-central1-docker.pkg.dev/{PROJECT_ID}/custom-inference-gpu/tgi-release:latest"
!docker build -t {CONTAINER} .
MODEL_NAME = "Llama-3.1-8B-Instruct-TGI"

#must secure sufficient space
from google.cloud.aiplatform.prediction import LocalModel
local_model = LocalModel(serving_container_image_uri=CONTAINER,
                         serving_container_environment_variables={
                             "VERTEX_CPR_MAX_WORKERS": "1",
                             "RUST_BACKTRACE": "full", #for stack trace printing,
                             "CUDA_MEMORY_FRACTION": "0.93",
                             #"AIP_PREDICT_ROUTE": "/generate",
                             #"AIP_HEALTH_ROUTE": "/metrics"
                             #"MODEL_ID": f"meta-llama/{MODEL_NAME}"
                             #"PORT": "5000", #server runs on 5000, or 8080 by dafault
                         },
                         #serving_container_ports=[5000], #expose container port, system map is random
                         serving_container_health_route="/metrics",
                         serving_container_predict_route="/generate",
                         serving_container_args=["--num-shard 1"], #We can use both serving_container_environment_variables and serving_container_args
                        )

from typing import List

from pydantic import BaseModel, conint


# Structure the model output is constrained to by the grammar below.
# Documented with comments instead of a class docstring on purpose: pydantic
# copies a class docstring into the JSON schema, which would change the request.
class Animals(BaseModel):
    location: str
    activity: str
    animals_seen: conint(ge=1, le=5)  # integer limited to the range 1..5
    animals: List[str]


prompt = "convert to JSON: I saw a puppy a cat and a raccoon during my bike ride in the park"

# Request body for the /generate route: the prompt plus generation parameters,
# with a JSON grammar built from the Animals schema.
prediction_input = dict(
    inputs=prompt,
    parameters=dict(
        repetition_penalty=1.3,
        grammar=dict(type="json", value=Animals.model_json_schema()),
    ),
)

In [None]:
# Display the serving container spec held by local_model (image, env vars,
# routes, args) to check the configuration before deploying.
local_model.get_serving_container_spec()

In [None]:
import json

# Manual deploy and test: run the serving container on this machine against
# the model artifacts in MODEL_PATH, then dump its logs.
local_endpoint = local_model.deploy_to_local_endpoint(
    artifact_uri=MODEL_PATH,
    gpu_count=-1,  # presumably "all available GPUs": confirm in the LocalModel docs
    container_ready_timeout=600,
)
local_endpoint.serve()
local_endpoint.print_container_logs()

In [None]:
# Send one request to the local endpoint (also useful for exercising the
# monitoring route), print the reply and then the container logs.
request_body = json.dumps(prediction_input)
predict_response = local_endpoint.predict(
    request=request_body,
    headers={"Content-Type": "application/json"},
)
print(predict_response.text)
local_endpoint.print_container_logs()

In [None]:
# Run a health check against the local endpoint and display the response body.
local_endpoint.run_health_check().text

In [None]:
# Stop the local endpoint, then print the container logs one last time.
local_endpoint.stop()
local_endpoint.print_container_logs()

In [None]:
# Run only when the serving container image has changed and must be re-published.
# Configure Docker to authenticate against the us-central1 Artifact Registry
# host through gcloud (--quiet skips the confirmation prompt).
!gcloud auth configure-docker us-central1-docker.pkg.dev --quiet
# Push the image referenced by local_model to its registry.
local_model.push_image()

In [None]:
from google.cloud import aiplatform

# Upload the model to Vertex AI. local_model is passed along with the GCS
# location of the model artifacts.
model = aiplatform.Model.upload(
    display_name=MODEL_NAME,
    local_model=local_model,
    artifact_uri=MODEL_PATH,
    # Optional settings, currently disabled:
    #parent_model = prev_model.resource_name,
    #is_default_version=True,
    #serving_container_environment_variables={
    #    "VERTEX_CPR_MAX_WORKERS": "1",
    #    "PORT": "8080", #server runs on 5000, or 8080 by default
    #    "RUST_BACKTRACE": "full", #for stack trace printing,
    #},
    #serving_container_ports=[8080],
    #serving_container_args = ["--num-shard 1"]
)

In [None]:
# Public endpoint (uncomment dedicated_endpoint_enabled for a dedicated one).
from google.cloud import aiplatform

endpoint = aiplatform.Endpoint.create(
    display_name=f"{MODEL_NAME} proxy public test endpoint",
    labels={"sample-key": "sample-value"},
    #dedicated_endpoint_enabled=True,
)

# Deploy the uploaded model on a single fixed replica with one L4 GPU.
endpoint.deploy(
    model=model,
    machine_type="g2-standard-4",
    accelerator_type="NVIDIA_L4",
    # Alternative hardware:
    #machine_type="a2-highgpu-1g",
    #accelerator_type="NVIDIA_TESLA_A100",
    accelerator_count=1,
    min_replica_count=1,
    max_replica_count=1,
    #service_account=SERVICE_ACCOUNT
    #traffic_percentage=50
    #traffic_split={'a':50, 'b':50}
)

In [None]:
#Public and dedicated endpoint predict
# json is imported here as well: otherwise this cell only works if the optional
# local-test cell (which holds the only other `import json`) was run first.
import json

from google.cloud import aiplatform
#ENDPOINT_ID = "0000"
#endpoint = aiplatform.Endpoint(ENDPOINT_ID)
# The TGI request body is sent as-is with raw_predict.
response = endpoint.raw_predict(
    body=json.dumps(prediction_input, indent=2).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
# Endpoint.raw_predict returns a requests.Response, which has no `.data`
# attribute; show the decoded body instead.
response.text

In [None]:
# Private endpoint, peered with the VPC network configured above.
# Docs: https://cloud.google.com/vertex-ai/docs/predictions/using-private-endpoints
from google.cloud import aiplatform

endpoint = aiplatform.PrivateEndpoint.create(
    display_name=f"{MODEL_NAME} proxy private test endpoint",
    network=f"projects/{PROJECT_NUMBER}/global/networks/{VPC_NETWORK}",
    labels={"sample-key": "sample-value"},
)

# C3, L4 and TPU machines are not allowed for private endpoints, hence the A100.
# Machine types: https://cloud.google.com/vertex-ai/docs/training/configure-compute
endpoint.deploy(
    model=model,
    machine_type="a2-highgpu-1g",
    accelerator_type="NVIDIA_TESLA_A100",
    accelerator_count=1,
    min_replica_count=1,
    max_replica_count=1,
    #service_account=SERVICE_ACCOUNT
    #traffic_percentage=50
    #traffic_split={'a':50, 'b':50}
)

In [None]:
#Private endpoint with raw predict, TGI does not use instances= so use raw_predict
# json is imported here as well: otherwise this cell only works if the optional
# local-test cell (which holds the only other `import json`) was run first.
import json

from google.cloud import aiplatform
#ENDPOINT_ID = "0000"
#endpoint = aiplatform.PrivateEndpoint(ENDPOINT_ID)
response = endpoint.raw_predict(
    body=json.dumps(prediction_input, indent=2).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
# Display the raw response body.
response.data

In [None]:
#Private endpoint health
import google.auth
# The transport submodule must be imported explicitly: `import google.auth`
# alone does not guarantee that google.auth.transport.requests is loaded.
import google.auth.transport.requests
import requests

# Obtain application-default credentials and refresh them to get an access token.
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

headers = {'Authorization': f'Bearer {creds.token}'}
# A timeout keeps the cell from hanging forever if the endpoint is unreachable.
response = requests.get(endpoint.health_http_uri, headers=headers, timeout=30)
print(response.text)