In [None]:
import glob
import os
import shutil
import subprocess

from huggingface_hub import snapshot_download

# Download a Hugging Face model snapshot to local disk and copy its top-level
# files to Cloud Storage.

# Prefer a token supplied through the HF_TOKEN environment variable so the
# secret never has to be written into the notebook; the literal below is only
# the placeholder fallback.
hf_token = os.environ.get("HF_TOKEN", "----")
hf_repo_id = "meta-llama/Llama-3.1-8B-Instruct"
BASE_ARTIFACT_URI = "gs://----"
LOCAL_MODEL_DIR = "/tmp/model"

# Start from an empty directory; a missing directory is not an error.
shutil.rmtree(LOCAL_MODEL_DIR, ignore_errors=True)

print("Start downloading")
snapshot_download(repo_id=hf_repo_id, token=hf_token, local_dir=LOCAL_MODEL_DIR)

print("Uploading")
# Same selection as the shell pattern "/tmp/model/*.*": top-level entries whose
# name contains a dot and does not start with one.
model_files = sorted(glob.glob(os.path.join(LOCAL_MODEL_DIR, "*.*")))
if not model_files:
    raise FileNotFoundError(f"No files matching *.* found in {LOCAL_MODEL_DIR}")
# check=True raises CalledProcessError when gcloud fails, so "Done" is only
# printed after a successful upload.
subprocess.run(
    ["gcloud", "storage", "cp", *model_files, f"{BASE_ARTIFACT_URI}/{hf_repo_id}"],
    check=True,
)
print("Done")

In [None]:
# WARNING: destructive. Force-removes every container on this host (running or
# stopped, together with its anonymous volumes), then force-removes every image.
# NOTE(review): presumably run to free disk space for the serving image — confirm
# nothing else on this machine depends on local containers or images.
!docker rm -vf $(docker ps -aq)
!docker rmi -f $(docker images -aq)

In [None]:
import os
import logging
import json
import urllib3
# Silence urllib3's InsecureRequestWarning for the rest of the session.
# NOTE(review): presumably needed because requests sent to the private
# endpoint's IP address skip TLS verification — confirm.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Configure the root logger to emit INFO-level records and above.
logging.basicConfig(level=logging.INFO)
# --- Project and region settings ("----" values appear to be redacted placeholders) ---
LOCATION = "us-central1"
PROJECT_NUMBER = "----"
PROJECT_ID = "----"
SECOND_PROJECT_ID = "----"

# Both model versions currently load the same Cloud Storage artifacts.
MODEL_PATH_V1 = MODEL_PATH_V2 = "gs://jk-model-repo/meta-llama/Llama-3.1-8B-Instruct"

# VPC_NETWORK = "globalnetwork"  # VPC network name for peering; only referenced by commented-out code

# --- Reference material ---
# LocalModel API:
#   https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.prediction.LocalModel#google_cloud_aiplatform_prediction_LocalModel
# serving_container_args for the TGI launcher:
#   https://huggingface.co/docs/text-generation-inference/en/reference/launcher
# TGI container entry point:
#   https://github.com/huggingface/Google-Cloud-Containers/blob/main/containers/tgi/gpu/2.4.0/entrypoint.sh
# AIP_* variables available inside a custom container:
#   https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements#aip-variables

In [None]:
# To run vLLM
# Image name inside this project's Artifact Registry repository; used as the
# serving container image URI further down.
CONTAINER = f"us-central1-docker.pkg.dev/{PROJECT_ID}/custom-inference-gpu/vllm-release:latest"
# Give the Model Garden vLLM image the name above. `docker image tag` only
# re-labels an image that is already present locally; it does not pull.
!docker image tag us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/vllm-inference.cu121.0-5.ubuntu2204.py310:latest {CONTAINER}
# Display name used when the model is uploaded and when the endpoint is created.
MODEL_NAME_V1 = "Llama-3.1-8B-Instruct-VLLM"

# Make sure enough disk space is available before building/running this model.
from google.cloud.aiplatform.prediction import LocalModel

# Describe how the vLLM OpenAI-compatible server is started and probed.
local_model_v1 = LocalModel(
    serving_container_image_uri=CONTAINER,
    serving_container_ports=[8080],  # port exposed by the container; the host-side mapping is random
    serving_container_health_route="/metrics",
    serving_container_predict_route="/v1/chat/completions",
    serving_container_args=[
        "python", "-m", "vllm.entrypoints.openai.api_server",
        "--host=0.0.0.0",
        "--port=8080",
        "--gpu-memory-utilization=0.9",
        "--max-model-len=16384",
    ],
)

from pydantic import BaseModel
from enum import Enum

# NOTE: these classes are documented with "#" comments on purpose. pydantic
# includes class docstrings in the JSON schema it generates, so adding a
# docstring would change `json_schema` and therefore the request payload.

# Allowed values for CarDescription.car_type. Mixing in `str` makes every
# member a string instance as well as an enum member.
class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"

# Shape of the JSON object the model is asked to produce.
class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType

# JSON Schema derived from CarDescription; sent as `guided_json` in the request.
json_schema = CarDescription.model_json_schema()

# Chat-completions request body; `guided_json` carries the schema the answer
# must conform to.
prediction_input = dict(
    model="openapi",
    messages=[
        dict(
            role="user",
            content="Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
        ),
    ],
    guided_json=json_schema,
)

In [None]:
# Display the serving container spec held by the LocalModel for inspection.
local_model_v1.get_serving_container_spec()

In [None]:
import json

# Manual deploy and test: start the serving container locally and keep it
# running until it is stopped explicitly in a later cell.
local_endpoint = local_model_v1.deploy_to_local_endpoint(
    artifact_uri=MODEL_PATH_V1,
    gpu_count=-1,  # -1 requests all available GPUs
    container_ready_timeout=600,  # generous start-up allowance for loading the model
)
local_endpoint.serve()
local_endpoint.print_container_logs()

In [None]:
# Send one request to the locally running container, then show its logs.
request_json = json.dumps(prediction_input)
predict_response = local_endpoint.predict(
    request=request_json,
    headers={"Content-Type": "application/json"},
)
print(predict_response.text)
local_endpoint.print_container_logs()

In [None]:
# Call the container's health route and show the response body as text.
local_endpoint.run_health_check().text

In [None]:
# Stop the local endpoint, then print the container logs.
local_endpoint.stop()
local_endpoint.print_container_logs()

In [None]:
# Run only when the serving container image has to be (re)published.
# Register gcloud as the Docker credential helper for the us-central1 Artifact
# Registry host; --quiet skips the interactive confirmation.
!gcloud auth configure-docker us-central1-docker.pkg.dev --quiet
# Push the image referenced by local_model_v1 to its registry.
local_model_v1.push_image()

In [None]:
from google.cloud import aiplatform

# Register the model in Vertex AI. The serving container settings are taken
# from `local_model_v1`; the model weights are read from `MODEL_PATH_V1`.
model_v1 = aiplatform.Model.upload(
    display_name=MODEL_NAME_V1,
    local_model=local_model_v1,
    artifact_uri=MODEL_PATH_V1,
    # Optional settings, currently disabled:
    # parent_model=prev_model.resource_name,
    # is_default_version=True,
    # serving_container_environment_variables={
    #     "VERTEX_CPR_MAX_WORKERS": "1",
    #     "PORT": "8080",            # server runs on 5000, or 8080 by default
    #     "RUST_BACKTRACE": "full",  # print full stack traces
    # },
    # serving_container_ports=[8080],
    # serving_container_args=["--num-shard 1"],
)

In [None]:
# Private endpoint reached through Private Service Connect (PSC)
# Refer https://cloud.google.com/vertex-ai/docs/predictions/using-private-endpoints
from google.cloud import aiplatform

# Projects that are allowed to connect to the endpoint.
psc_config = aiplatform.PrivateEndpoint.PrivateServiceConnectConfig(
    project_allowlist=[PROJECT_ID, SECOND_PROJECT_ID],
)
endpoint = aiplatform.PrivateEndpoint.create(
    display_name=f"{MODEL_NAME_V1} proxy private test endpoint",
    private_service_connect_config=psc_config,
    # network=f"projects/{PROJECT_NUMBER}/global/networks/{VPC_NETWORK}",
    labels={"sample-key": "sample-value"},
)

# NOTE(review): an earlier note here said "C3, L4, TPU not allowed for private
# endpoint", yet this deployment requests an L4 accelerator — confirm against
# https://cloud.google.com/vertex-ai/docs/training/configure-compute
endpoint.deploy(
    model=model_v1,
    machine_type="g2-standard-4",
    accelerator_type="NVIDIA_L4",
    # Alternative hardware:
    # machine_type="a2-highgpu-1g",
    # accelerator_type="NVIDIA_TESLA_A100",
    accelerator_count=1,
    min_replica_count=1,
    max_replica_count=1,  # set higher than min_replica_count to allow autoscaling
    # service_account=SERVICE_ACCOUNT
    # traffic_percentage=50
    # traffic_split={'a':50, 'b':50}
)

In [None]:
# Show the endpoint's underlying API resource for inspection.
endpoint.gca_resource

In [None]:
# The PSC service attachment is read from the first deployed model; it is the
# target of the forwarding rule created below.
first_deployed_model = endpoint.list_models()[0]
service_attachment = first_deployed_model.private_endpoints.service_attachment
print(service_attachment)

In [None]:
# Reserve the address "psc-prediction" in subnet "subnet4".
# The region comes from LOCATION so that creation and the cleanup cell (which
# deletes with --region={LOCATION}) always target the same region.
! gcloud compute addresses create psc-prediction \
    --region={LOCATION} \
    --subnet=subnet4

In [None]:
# Create the forwarding rule that connects the reserved address to the
# endpoint's service attachment.
# The region comes from LOCATION so that creation and the cleanup cell (which
# deletes with --region={LOCATION}) always target the same region.
! gcloud compute forwarding-rules create op-psc-endpoint \
    --network=globalnetwork \
    --address=psc-prediction \
    --target-service-attachment={service_attachment} \
    --region={LOCATION}

In [None]:
import ipaddress

# Look up the IP address assigned to the forwarding rule. The raw command
# output is kept in its own variable so IP_ADDRESS is only ever a string.
describe_output = ! gcloud compute forwarding-rules describe op-psc-endpoint --region={LOCATION} --format='value(IPAddress)'
if not describe_output:
    raise RuntimeError("gcloud returned no output for forwarding rule op-psc-endpoint")
IP_ADDRESS = describe_output[0].strip()
try:
    # Fail here, with the gcloud output attached, instead of later inside
    # raw_predict when the command printed an error rather than an address.
    ipaddress.ip_address(IP_ADDRESS)
except ValueError as exc:
    raise RuntimeError(
        "Could not read the forwarding rule IP address; gcloud output was: "
        + "\n".join(describe_output)
    ) from exc
print(IP_ADDRESS)

In [None]:
# Raw prediction against the private endpoint through the PSC address.
from google.cloud import aiplatform

# To reuse an endpoint created earlier instead of the one from this session:
# ENDPOINT_ID = "1745348265257205760"
# endpoint = aiplatform.PrivateEndpoint(ENDPOINT_ID)
request_body = json.dumps(prediction_input, indent=2).encode('utf-8')
response = endpoint.raw_predict(
    body=request_body,
    headers={'Content-Type': 'application/json'},
    endpoint_override=IP_ADDRESS,
)
# Show only the generated message text from the chat-completions reply.
json.loads(response.data)['choices'][0]['message']['content']

In [None]:
# Second model version; its container settings match the first version.
MODEL_NAME_V2 = "Llama-3.1-8B-Instruct-VLLM-V1"
local_model_v2 = LocalModel(
    serving_container_image_uri=CONTAINER,
    serving_container_ports=[8080],  # port exposed by the container; the host-side mapping is random
    serving_container_health_route="/metrics",
    serving_container_predict_route="/v1/chat/completions",
    serving_container_args=[
        "python", "-m", "vllm.entrypoints.openai.api_server",
        "--host=0.0.0.0",
        "--port=8080",
        "--gpu-memory-utilization=0.9",
        "--max-model-len=16384",
    ],
)

In [None]:
# Test the plain local endpoint. The endpoint is used as a context manager,
# so it only exists for the duration of the `with` block.
import json
import time

with local_model_v2.deploy_to_local_endpoint(
    artifact_uri=MODEL_PATH_V2,
    gpu_count=-1,
    container_ready_timeout=600,
) as local_endpoint:
    request_json = json.dumps(prediction_input)
    predict_response = local_endpoint.predict(
        request=request_json,
        headers={"Content-Type": "application/json"},
    )
    health_check_response = local_endpoint.run_health_check()
    # NOTE(review): presumably waits so that more container output is available
    # before the logs are printed — confirm the 30 s pause is still needed.
    time.sleep(30)
    local_endpoint.print_container_logs()

# The responses were captured inside the block and are reported afterwards.
print(health_check_response, health_check_response.content)
print(predict_response.text)

In [None]:
from IPython.display import HTML, Markdown, display
import plotly.graph_objects as go
import pandas as pd
from google.cloud import aiplatform
from vertexai.preview.evaluation import EvalTask
from vertexai.preview.evaluation.metrics import (
    PointwiseMetric,
    PointwiseMetricPromptTemplate,
    TrajectorySingleToolUse,
)

def display_eval_report(eval_result: "EvalResult") -> None:
    """Render an evaluation result as a summary table and a row-wise table.

    Args:
        eval_result: The object returned by ``EvalTask.evaluate()``. Only its
            ``summary_metrics`` mapping and its ``metrics_table`` are read, so
            it is not a ``pandas.DataFrame`` itself.
    """
    # One row, one column per summary metric.
    metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient="index").T
    display(Markdown("### Summary Metrics"))
    display(metrics_df)

    display(Markdown("### Row-wise Metrics"))
    display(eval_result.metrics_table)
    
# User requests, one per evaluation example (examples 1-4, in order).
prompt = [
    "Turn device_2 power off",
    "Get user_x preference temperature and set Living Room temperature to the preferred value",
    "Get user_y preference temperature and set Master Room temperature to the preferred value",
    "Set all devices off",
]

# Expected tool calls for each example, in the expected order.
reference_trajectory = [
    # example 1
    [
        {"tool_name": "set_device_info",
         "tool_input": {"device_id": "device_2", "updates": {"status": "OFF"}}},
    ],
    # example 2
    [
        {"tool_name": "get_user_preferences",
         "tool_input": {"user_id": "user_x"}},
        {"tool_name": "set_temperature",
         "tool_input": {"location": "Living Room", "temperature": 23}},
    ],
    # example 3
    [
        {"tool_name": "get_user_preferences",
         "tool_input": {"user_id": "user_y"}},
        {"tool_name": "set_temperature",
         "tool_input": {"location": "Master Room", "temperature": 26}},
    ],
    # example 4
    [
        {"tool_name": "set_device_info",
         "tool_input": {"device_id": "device_1", "updates": {"status": "OFF"}}},
        {"tool_name": "set_device_info",
         "tool_input": {"device_id": "device_2", "updates": {"status": "OFF"}}},
    ],
]

# Tool calls the agent made for each example. They differ from the reference
# in places so that the trajectory metrics have something to measure.
predicted_trajectory = [
    # example 1
    [
        {"tool_name": "set_device_info",
         "tool_input": {"device_id": "device_3", "updates": {"status": "OFF"}}},
    ],
    # example 2
    [
        {"tool_name": "get_user_preferences",
         "tool_input": {"user_id": "user_z"}},
        {"tool_name": "set_temperature",
         "tool_input": {"location": "Living Room", "temperature": 23}},
    ],
    # example 3: same calls as the reference, input parameters in a different order
    [
        {"tool_name": "get_user_preferences",
         "tool_input": {"user_id": "user_y"}},
        {"tool_name": "set_temperature",
         "tool_input": {"temperature": 26, "location": "Master Room"}},
    ],
    # example 4: reference calls in a different order plus one additional device
    [
        {"tool_name": "set_device_info",
         "tool_input": {"device_id": "device_2", "updates": {"status": "OFF"}}},
        {"tool_name": "set_device_info",
         "tool_input": {"device_id": "device_1", "updates": {"status": "OFF"}}},
        {"tool_name": "set_device_info",
         "tool_input": {"device_id": "device_3", "updates": {"status": "OFF"}}},
    ],
]

# Final natural-language answer of the agent for each example (examples 1-4).
response = [
    "Device 3 power off",
    "Set Living Room temperature to 23 celcius",
    "Set Master Room temperature to 26 celcius",
    "All devices turned off",
]

# One row per example; the four lists above become the four columns.
eval_dataset = pd.DataFrame(
    dict(
        prompt=prompt,
        predicted_trajectory=predicted_trajectory,
        reference_trajectory=reference_trajectory,
        response=response,
    )
)


In [None]:
# Display the assembled evaluation dataset.
eval_dataset

In [None]:
# Instructions handed to the judge model for the custom metric.
criteria = {
    "Follows trajectory": "\n".join(
        [
            "Evaluate whether the agent's response logically follows from the "
            "sequence of actions it took. Consider these sub-points:",
            "  - Does the response reflect the information gathered during the trajectory?",
            "  - Is the response consistent with the goals and constraints of the task?",
            "  - Are there any unexpected or illogical jumps in reasoning?",
            "Provide specific examples from the trajectory and response to support your evaluation.",
        ]
    )
}

# Binary rating scale: "1" passes, "0" fails.
pointwise_rating_rubric = dict([
    ("1", "Follows trajectory"),
    ("0", "Does not follow trajectory"),
])

# Build the judge prompt from the criteria and rubric; the dataset columns
# listed in `input_variables` are made available to the prompt.
response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(
    criteria=criteria,
    rating_rubric=pointwise_rating_rubric,
    input_variables=["prompt", "predicted_trajectory"],
)
# Print the rendered template so it can be reviewed before it is used.
print(response_follows_trajectory_prompt_template.prompt_data)

# Custom model-based metric; it is passed to EvalTask alongside the built-in ones.
response_follows_trajectory_metric = PointwiseMetric(
    metric="response_follows_trajectory",
    metric_prompt_template=response_follows_trajectory_prompt_template,
)

In [None]:
from vertexai.preview.evaluation import EvalTask

eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=[
        # 0/1: predicted trajectory is exactly the reference trajectory
        "trajectory_exact_match",
        # 0/1: all reference calls appear in the same order; extra calls are allowed
        "trajectory_in_order_match",
        # 0/1: all reference calls appear in any order; extra calls are allowed
        "trajectory_any_order_match",
        # 0-1, higher is better: (predicted calls found in reference) / (number of predicted calls)
        "trajectory_precision",
        # 0-1, higher is better: (reference calls found in predicted) / (number of reference calls)
        "trajectory_recall",
        "safety",
        response_follows_trajectory_metric,
    ],
)

# The dataset already contains responses and trajectories, so no runnable is
# passed. Pass `runnable=RUNNABLE` to generate them dynamically instead; that
# also produces the latency and failure columns.
eval_result = eval_task.evaluate()

display_eval_report(eval_result)

In [None]:
# Upload the second version. `parent_model` attaches it to the model that was
# registered for v1 instead of creating a separate model.
model_v2 = aiplatform.Model.upload(
    display_name=MODEL_NAME_V2,
    local_model=local_model_v2,
    artifact_uri=MODEL_PATH_V2,
    parent_model=model_v1.resource_name,
    # Optional settings, currently disabled:
    # is_default_version=True,
    # serving_container_environment_variables={
    #     "VERTEX_CPR_MAX_WORKERS": "1",
    #     "PORT": "8080",            # server runs on 5000, or 8080 by default
    #     "RUST_BACKTRACE": "full",  # print full stack traces
    # },
    # serving_container_ports=[8080],
    # serving_container_args=["--num-shard 1"],
)

In [None]:
# Deploy the second model to the same endpoint (same hardware as the first,
# for testing) and route half of the traffic to it.
endpoint.deploy(
    model=model_v2,
    machine_type="g2-standard-4",
    accelerator_type="NVIDIA_L4",
    # Alternative hardware:
    # machine_type="a2-highgpu-1g",
    # accelerator_type="NVIDIA_TESLA_A100",
    accelerator_count=1,
    min_replica_count=1,
    max_replica_count=1,
    # service_account=SERVICE_ACCOUNT
    traffic_percentage=50,
    # traffic_split={'a':50, 'b':50}
)

In [None]:
# Print each deployed model below a separator line.
for deployed_model in endpoint.list_models():
    print("----------------------------", deployed_model, sep="\n")

In [None]:
# Route all traffic to v2.
# The dictionary key is a placeholder and must be replaced by hand with the ID
# of the *deployed* model on this endpoint (not the model's ID in the model
# repository). The deployed models are printed by the list_models() cell above.
endpoint = endpoint.update(
    traffic_split={
                    "[DEPLOYED MODELID(Not model repository)]": 100,
    }
)

In [None]:
#Custom network timeout endpoint
from google.cloud import aiplatform
import urllib3
from typing import Any, Dict, Optional, Union


class CustomPrivateEndpoint(aiplatform.PrivateEndpoint):
    """``PrivateEndpoint`` variant that applies a timeout to every HTTP request.

    ``_http_request`` is overridden so that the request issued through
    ``self._http_client`` carries ``self._TIMEOUT``. Call ``setTimeout`` to
    change the timeout of a single instance.
    """

    # Responses with a status code below this value are treated as successful.
    _SUCCESSFUL_HTTP_RESPONSE = 300
    # Class-wide default of 10 seconds total; setTimeout() overrides it per instance.
    _TIMEOUT = urllib3.Timeout(10)

    def setTimeout(self, seconds: Union[float, urllib3.Timeout]):
        """Set the timeout used for this instance's requests.

        Args:
            seconds: Total timeout in seconds, or a ready-made
                ``urllib3.Timeout`` when separate connect and read limits are
                wanted, e.g. ``urllib3.Timeout(connect=2, read=30)``.
        """
        if isinstance(seconds, urllib3.Timeout):
            self._TIMEOUT = seconds
        else:
            self._TIMEOUT = urllib3.Timeout(total=seconds)

    def _http_request(
        self,
        method: str,
        url: str,
        body: Optional[Dict[Any, Any]] = None,
        headers: Optional[Dict[str, str]] = None,
    ) -> "urllib3.response.HTTPResponse":  # type: ignore # noqa: F821
        """Send one HTTP request with the configured timeout.

        Args:
            method: HTTP method name.
            url: Target URL.
            body: Request body, passed through unchanged.
            headers: Request headers, passed through unchanged.

        Returns:
            The response, when its status is below ``_SUCCESSFUL_HTTP_RESPONSE``.

        Raises:
            RuntimeError: If the status is not below that threshold, or if
                urllib3 raises ``MaxRetryError``. Other urllib3 exceptions
                are not caught here and propagate to the caller.
        """
        try:
            response = self._http_client.request(
                method=method, url=url, body=body, headers=headers, timeout=self._TIMEOUT
            )
        except urllib3.exceptions.MaxRetryError as exc:
            raise RuntimeError(
                f"Failed to make a {method} request to this URI, make sure: "
                " this call is being made inside the network this PrivateEndpoint is peered to "
                f"({self._gca_resource.network}), calling health_check() returns True, "
                f"and that {url} is a valid URL."
            ) from exc

        if response.status < self._SUCCESSFUL_HTTP_RESPONSE:
            return response
        raise RuntimeError(
            f"{response.status} - Failed to make request, see response: "
            + response.data.decode("utf-8")
        )

In [None]:
# Attach to the endpoint through the timeout-aware class. "[ENDPOINT ID]" is a
# placeholder and must be replaced with a real endpoint ID.
endpoint = CustomPrivateEndpoint("[ENDPOINT ID]")
endpoint.setTimeout(1)  # 1-second total timeout

request_body = json.dumps(prediction_input, indent=2).encode('utf-8')
response = endpoint.raw_predict(
    body=request_body,
    headers={'Content-Type': 'application/json'},
    endpoint_override=IP_ADDRESS,
)
reply = json.loads(response.data)
print(reply['choices'][0]['message']['content'])

In [None]:
from google.cloud import aiplatform

# Tear down the Vertex AI resources created by this notebook.
# "[ENDPOINT ID]" is a placeholder and must be replaced with a real endpoint ID.
ENDPOINT_ID = "[ENDPOINT ID]"
endpoint = aiplatform.PrivateEndpoint(ENDPOINT_ID)
endpoint.undeploy_all()
endpoint.delete()

# model_v2 was uploaded with parent_model=model_v1.resource_name, i.e. as a
# version of the same Model resource. Deleting model_v1 deletes that resource,
# so a second delete would target something that no longer exists. The
# comparison is made before deleting anything and only issues the second
# delete when the two really are separate models.
models_share_resource = model_v1.resource_name == model_v2.resource_name
model_v1.delete()
if not models_share_resource:
    model_v2.delete()

In [None]:
# Remove the PSC networking resources: first the forwarding rule, then the
# address it used. --quiet skips the confirmation prompts.
! gcloud compute forwarding-rules delete op-psc-endpoint --region={LOCATION}  --quiet
! gcloud compute addresses delete psc-prediction --region={LOCATION} --quiet