# Deploy chat model

Let's deploy a provisioned throughput `meta-llama-3-1-3b-instruct` model that we'll use in our DSPy program. The model will be hosted on a [Databricks Foundation Model Serving endpoint](https://docs.databricks.com/en/machine-learning/foundation-models/index.html). Provisioned throughput endpoints can be created [using the API or the Serving UI](https://docs.databricks.com/aws/en/machine-learning/foundation-model-apis/deploy-prov-throughput-foundation-model-apis). You can easily swap out `meta-llama-3-1-3b-instruct` for [other providers or local models](https://github.com/stanfordnlp/dspy/blob/main/examples/migration.ipynb) via the `dspy.LM` class, as shown below.

In [0]:
%pip install -U -qqqq mlflow>=2.18.0
dbutils.library.restartPython()

In [0]:
import requests
import json
import mlflow

In [0]:
# Show the workspace host that MLflow resolves for this notebook context;
# the same value is used later as the root URL for the REST calls.
mlflow.utils.databricks_utils.get_databricks_host_creds().host

In [0]:
from mlflow.models import ModelConfig

In [0]:
# Load the notebook settings (endpoint names, model names, secret scope)
# from the YAML file that sits next to this notebook.
config_file = "config.yaml"
model_config = ModelConfig(development_config=config_file)

In [0]:
# Name of the serving endpoint to create, read from the notebook config
chat_endpoint_name = model_config.get("chat_endpoint_name")

# Name of the registered model to serve
chat_model_name = model_config.get("chat_model_name")

# Model version to deploy. This is pinned to version 1 rather than looked up
# dynamically; change it here to deploy a different version.
chat_model_version = 1

# Workspace host for the current notebook context, used as the REST API root
API_ROOT = mlflow.utils.databricks_utils.get_databricks_host_creds().host

# The API token is fetched from a secret scope instead of being hard-coded
SECRET_SCOPE_NAME = model_config.get("secret_scope_name")
SECRET_SCOPE_KEY = model_config.get("secret_key")

API_TOKEN = dbutils.secrets.get(SECRET_SCOPE_NAME, SECRET_SCOPE_KEY)

# Shared by every REST call in this notebook. The header name must be
# "Content-Type" and the JSON media type is "application/json", which matches
# the bodies later sent with `json=`.
headers = {
  "Content-Type": "application/json",
  "Authorization": f"Bearer {API_TOKEN}",
}

# Ask the workspace whether this model version supports provisioned throughput
optimization_response = requests.get(
  url=f"{API_ROOT}/api/2.0/serving-endpoints/get-model-optimization-info/{chat_model_name}/{chat_model_version}",
  headers=headers,
)

# Surface HTTP failures (bad token, unknown model, ...) as such instead of
# letting them be misreported as "not eligible" below.
optimization_response.raise_for_status()
optimizable_info = optimization_response.json()

# A missing or falsy "optimizable" field both mean the model cannot be deployed
if not optimizable_info.get('optimizable'):
  raise ValueError("Model is not eligible for provisioned throughput")

# Later cells size the endpoint as a multiple of this value
chunk_size = optimizable_info['throughput_chunk_size']





In [0]:
# Provisioned throughput range for the chat endpoint: from zero up to
# three times the chunk size reported for the model.
min_provisioned_throughput = 0
max_provisioned_throughput = chunk_size * 3

# Sent as `scale_to_zero_enabled` when the endpoint is created
scale_to_zero = True

In [0]:
# The chat endpoint serves exactly one entity: the registered chat model at
# the chosen version, with the throughput range configured above.
chat_served_entity = {
  "entity_name": chat_model_name,
  "entity_version": chat_model_version,
  "min_provisioned_throughput": min_provisioned_throughput,
  "max_provisioned_throughput": max_provisioned_throughput,
  "scale_to_zero_enabled": scale_to_zero,
}

# Request body for creating the serving endpoint
data = {
  "name": chat_endpoint_name,
  "config": {"served_entities": [chat_served_entity]},
}

# Create the endpoint, then pretty-print whatever JSON the API returned
response = requests.post(
  f"{API_ROOT}/api/2.0/serving-endpoints",
  headers=headers,
  json=data,
)

print(json.dumps(response.json(), indent=4))

# Deploy embedding model



In [0]:
# Set the name of the MLflow endpoint
embedding_endpoint_name = model_config.get("embedding_endpoint_name")

# Name of the registered MLflow model
embedding_model_name = model_config.get("embedding_model_name")

# Get the latest version of the MLflow model
embedding_model_version = 1

In [0]:
# Query the optimization-info API for the embedding model version
optimization_info_url = (
  f"{API_ROOT}/api/2.0/serving-endpoints/get-model-optimization-info/"
  f"{embedding_model_name}/{embedding_model_version}"
)
optimizable_info = requests.get(url=optimization_info_url, headers=headers).json()

# Stop unless the response contains a truthy "optimizable" field
is_optimizable = 'optimizable' in optimizable_info and optimizable_info['optimizable']
if not is_optimizable:
  raise ValueError("Model is not eligible for provisioned throughput")

# Overwrites the chat model's chunk size; the next cell sizes the embedding
# endpoint from this value.
chunk_size = optimizable_info['throughput_chunk_size']

In [0]:
# Provisioned throughput range for the embedding endpoint: from zero up to
# three times the chunk size reported for the embedding model.
min_provisioned_throughput = 0
max_provisioned_throughput = chunk_size * 3

# Sent as `scale_to_zero_enabled` when the endpoint is created
scale_to_zero = True

In [0]:
# The embedding endpoint serves exactly one entity: the registered embedding
# model at the chosen version, with the throughput range configured above.
embedding_served_entity = {
  "entity_name": embedding_model_name,
  "entity_version": embedding_model_version,
  "min_provisioned_throughput": min_provisioned_throughput,
  "max_provisioned_throughput": max_provisioned_throughput,
  "scale_to_zero_enabled": scale_to_zero,
}

# Request body for creating the serving endpoint
data = {
  "name": embedding_endpoint_name,
  "config": {"served_entities": [embedding_served_entity]},
}

# Create the endpoint, then pretty-print whatever JSON the API returned
response = requests.post(
  f"{API_ROOT}/api/2.0/serving-endpoints",
  headers=headers,
  json=data,
)

print(json.dumps(response.json(), indent=4))