# DeepSeek R1 Distillation 1.5B vLLM serving using the Azure ML Python SDK

> [1] Please use `Python 3.10 - SDK v2 (azureml_py310_sdkv2)` conda environment.<br>[2] Please make sure you prepare [Hugging Face API Token](https://huggingface.co/docs/hub/security-tokens).

You may need to install `azure-ai-ml` and `azure-identity` before start

In [1]:
# %pip install -q azure-ai-ml azure-identity

## 1. Prepare Environment Parameters

---


In [2]:
import os
import yaml
from logger import logger
from datetime import datetime

snapshot_date = datetime.now().strftime("%Y-%m-%d")
HF_MODEL_NAME_OR_PATH = 'deepseek-ai/DeepSeek-R1-Distill-Llama-70B'

with open("config.yml") as f:
    d = yaml.load(f, Loader=yaml.FullLoader)

AZURE_SUBSCRIPTION_ID = d["config"]["AZURE_SUBSCRIPTION_ID"]
AZURE_RESOURCE_GROUP = d["config"]["AZURE_RESOURCE_GROUP"]
AZURE_WORKSPACE = d["config"]["AZURE_WORKSPACE"]
HF_TOKEN = d["config"]["HF_TOKEN"]
IS_DEBUG = d["config"]["IS_DEBUG"]

azure_env_name = 'ds-llama-70b-env-bnb'
azure_model_name = 'ds-llama-70b-bnb-model'
azure_endpoint_name = 'ds-llama-70b-bnb-ep'
azure_deployment_name = 'blue'
azure_serving_cluster_size = 'Standard_ND40rs_v2'


if IS_DEBUG:
    logger.debug("===== 0. Azure ML Deployment Info =====")
    logger.debug(f"AZURE_SUBSCRIPTION_ID={AZURE_SUBSCRIPTION_ID}")
    logger.debug(f"AZURE_RESOURCE_GROUP={AZURE_RESOURCE_GROUP}")
    logger.debug(f"AZURE_WORKSPACE={AZURE_WORKSPACE}")
    logger.debug(f"HF_MODEL_NAME_OR_PATH={HF_MODEL_NAME_OR_PATH}")

    logger.debug(f"azure_env_name={azure_env_name}")
    logger.debug(f"azure_model_name={azure_model_name}")
    logger.debug(f"azure_endpoint_name={azure_endpoint_name}")
    logger.debug(f"azure_deployment_name={azure_deployment_name}")
    logger.debug(f"azure_serving_cluster_size={azure_serving_cluster_size}")

2025-03-05 15:25:00,033 - logger - DEBUG - ===== 0. Azure ML Deployment Info =====
2025-03-05 15:25:00,034 - logger - DEBUG - AZURE_SUBSCRIPTION_ID=e56790f8-0506-49eb-95b8-82817828d59d
2025-03-05 15:25:00,035 - logger - DEBUG - AZURE_RESOURCE_GROUP=dev
2025-03-05 15:25:00,035 - logger - DEBUG - AZURE_WORKSPACE=test-vllm
2025-03-05 15:25:00,036 - logger - DEBUG - HF_MODEL_NAME_OR_PATH=deepseek-ai/DeepSeek-R1-Distill-Llama-70B
2025-03-05 15:25:00,037 - logger - DEBUG - azure_env_name=ds-llama-70b-env-bnb
2025-03-05 15:25:00,037 - logger - DEBUG - azure_model_name=ds-llama-70b-bnb-model
2025-03-05 15:25:00,038 - logger - DEBUG - azure_endpoint_name=ds-llama-70b-bnb-ep
2025-03-05 15:25:00,038 - logger - DEBUG - azure_deployment_name=blue
2025-03-05 15:25:00,039 - logger - DEBUG - azure_serving_cluster_size=Standard_ND40rs_v2


<br>

## 2. Serving preparation

---

### 2.1. Configure workspace details

To connect to a workspace, we need identifying parameters - a subscription, a resource group, and a workspace name. We will use these details in the MLClient from azure.ai.ml to get a handle on the Azure Machine Learning workspace we need. We will use the default Azure authentication for this hands-on.


In [3]:
# import required libraries
import time
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient, Input
from azure.ai.ml import command
from azure.ai.ml.entities import Model
from azure.ai.ml.constants import AssetTypes
from azure.core.exceptions import ResourceNotFoundError, ResourceExistsError

logger.info(f"===== 2. Serving preparation =====")
logger.info(f"Calling DefaultAzureCredential.")
credential = DefaultAzureCredential()
ml_client = None
try:
    ml_client = MLClient.from_config(credential)
except Exception as ex:
    print(ex)
    ml_client = MLClient(
        credential, AZURE_SUBSCRIPTION_ID, AZURE_RESOURCE_GROUP, AZURE_WORKSPACE
    )

2025-03-05 15:25:02,481 - logger - INFO - ===== 2. Serving preparation =====
2025-03-05 15:25:02,482 - logger - INFO - Calling DefaultAzureCredential.
Found the config file in: /config.json


### 2.2. Create model asset


In [4]:
def get_or_create_model_asset(
    ml_client,
    model_name,
    job_name=None,
    model_dir="outputs",
    model_type="custom_model",
    update=False,
):
    try:
        latest_model_version = max(
            [int(m.version) for m in ml_client.models.list(name=model_name)]
        )
        if update:
            raise ResourceExistsError("Found Model asset, but will update the Model.")
        else:
            model_asset = ml_client.models.get(
                name=model_name, version=latest_model_version
            )
            logger.info(f"Found Model asset: {model_name}. Will not create again")
    except (ResourceNotFoundError, ResourceExistsError) as e:
        logger.info(f"Exception: {e}")
        run_model = Model(
            name=model_name,
            path=model_dir,
            description="Model created from run.",
            type=model_type,  # mlflow_model, custom_model, triton_model
        )
        model_asset = ml_client.models.create_or_update(run_model)
        logger.info(f"Created Model asset: {model_name} from {model_dir}")

    return model_asset

In [5]:
model = get_or_create_model_asset(
    ml_client,
    azure_model_name,
    job_name=None,
    model_dir="deepseek-adapter", # dummy model file folder
    model_type="custom_model",
    update=False,
)

2025-03-05 15:25:04,020 - logger - INFO - Exception: (UserError) The specified resource was not found.
Code: UserError
Message: The specified resource was not found.
Exception Details:	(ModelNotFound) Model container with name: ds-llama-70b-bnb-model not found.
	Code: ModelNotFound
	Message: Model container with name: ds-llama-70b-bnb-model not found.
2025-03-05 15:25:09,375 - logger - INFO - Created Model asset: ds-llama-70b-bnb-model from deepseek-adapter


#### Docker environment


In [6]:
from azure.ai.ml.entities import Environment, BuildContext


def get_or_create_docker_environment_asset(
    ml_client, env_name, docker_dir, inference_config=None, update=False
):

    try:
        latest_env_version = max(
            [int(e.version) for e in ml_client.environments.list(name=env_name)]
        )
        if update:
            raise ResourceExistsError(
                "Found Environment asset, but will update the Environment."
            )
        else:
            env_asset = ml_client.environments.get(
                name=env_name, version=latest_env_version
            )
            logger.info(f"Found Environment asset: {env_name}. Will not create again")
    except (ResourceNotFoundError, ResourceExistsError) as e:
        logger.info(f"Exception: {e}")
        env_docker_image = Environment(
            build=BuildContext(path=docker_dir),
            name=env_name,
            description="Environment created from a Docker context.",
            inference_config=inference_config,
        )
        env_asset = ml_client.environments.create_or_update(env_docker_image)
        logger.info(f"Created Environment asset: {env_name}")

    return env_asset


inference_config = {
    "liveness_route": {
        "port": 8000,
        "path": "/health",
    },
    "readiness_route": {
        "port": 8000,
        "path": "/health",
    },
    "scoring_route": {
        "port": 8000,
        "path": "/",
    },
}

env = get_or_create_docker_environment_asset(
    ml_client, azure_env_name, "docker", inference_config, update=True
)

2025-03-05 15:25:09,984 - logger - INFO - Exception: (UserError) System.Net.Http.HttpConnectionResponseContent
Code: UserError
Message: System.Net.Http.HttpConnectionResponseContent
[32mUploading docker (0.0 MBs): 100%|██████████| 350/350 [00:00<00:00, 11657.41it/s]
[39m

2025-03-05 15:25:17,965 - logger - INFO - Created Environment asset: ds-llama-70b-env-bnb


<br>

## 3. Serving

---

### 3.1. Create endpoint

Create an endpoint. This process does not provision a GPU cluster yet.


In [7]:
%%time

from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    IdentityConfiguration,
    ManagedIdentityConfiguration,
)

logger.info(f"===== 3. Serving =====")

t0 = time.time()

# Check if the endpoint already exists in the workspace
try:
    endpoint = ml_client.online_endpoints.get(azure_endpoint_name)
    logger.info("---Endpoint already exists---")
except:
    # Create an online endpoint if it doesn't exist

    # Define the endpoint
    endpoint = ManagedOnlineEndpoint(
        name=azure_endpoint_name,
        description=f"Test endpoint for {model.name}",
    )

# Trigger the endpoint creation
try:
    ml_client.begin_create_or_update(endpoint).wait()
    logger.info("\n---Endpoint created successfully---\n")
except Exception as err:
    raise RuntimeError(f"Endpoint creation failed. Detailed Response:\n{err}") from err

t1 = time.time()

from humanfriendly import format_timespan

timespan = format_timespan(t1 - t0)
logger.info(f"Creating Endpoint took {timespan}")

2025-03-05 15:25:17,974 - logger - INFO - ===== 3. Serving =====
2025-03-05 15:26:22,362 - logger - INFO - 
---Endpoint created successfully---

2025-03-05 15:26:22,377 - logger - INFO - Creating Endpoint took 1 minute and 4.39 seconds


CPU times: user 96.1 ms, sys: 7.98 ms, total: 104 ms
Wall time: 1min 4s


### 3.2. Create Deployment

Create a Deployment. This takes a lot of time as GPU clusters must be provisioned and the serving environment must be built.


In [8]:
env_vars = {
    "MODEL_NAME": HF_MODEL_NAME_OR_PATH,
    "VLLM_ARGS": "--tensor-parallel-size 8 --max-model-len 32768 --enforce-eager --quantization bitsandbytes --load-format bitsandbytes --dtype float16",
    "HUGGING_FACE_HUB_TOKEN": HF_TOKEN,
}
deployment_env_vars = {**env_vars}

In [11]:
%%time

from azure.ai.ml.entities import (    
    OnlineRequestSettings,
    CodeConfiguration,
    ManagedOnlineDeployment,
    ProbeSettings,
    Environment
)

deployment = ManagedOnlineDeployment(
    name=azure_deployment_name,
    endpoint_name=azure_endpoint_name,
    model=model,
    instance_type=azure_serving_cluster_size,
    instance_count=1,
    environment_variables=deployment_env_vars,    
    environment=env,
    request_settings=OnlineRequestSettings(
        max_concurrent_requests_per_instance=2,
        request_timeout_ms=120000, 
        max_queue_wait_ms=240000
    ),
    liveness_probe=ProbeSettings(
        failure_threshold=5,
        success_threshold=1,
        timeout=10,
        period=30,
        initial_delay=120
    ),
    readiness_probe=ProbeSettings(
        failure_threshold=30,
        success_threshold=1,
        timeout=2,
        period=10,
        initial_delay=120,
    ),
)

# Trigger the deployment creation
try:
    ml_client.begin_create_or_update(deployment).wait()
    logger.info("\n---Deployment created successfully---\n")
except Exception as err:
    raise RuntimeError(
        f"Deployment creation failed. Detailed Response:\n{err}"
    ) from err
    
endpoint.traffic = {azure_deployment_name: 100}
endpoint_poller = ml_client.online_endpoints.begin_create_or_update(endpoint)

Check: endpoint ds-llama-70b-bnb-ep exists


............................................................................................................................................................................................................................................................................................................................................

2025-03-05 16:01:40,438 - logger - INFO - 
---Deployment created successfully---



CPU times: user 4.15 s, sys: 289 ms, total: 4.43 s
Wall time: 28min 52s


In [12]:
endpoint_results = endpoint_poller.result()

In [13]:
print(endpoint_results)

auth_mode: key
description: Test endpoint for ds-llama-70b-bnb-model
id: /subscriptions/e56790f8-0506-49eb-95b8-82817828d59d/resourceGroups/dev/providers/Microsoft.MachineLearningServices/workspaces/test-vllm/onlineEndpoints/ds-llama-70b-bnb-ep
identity:
  principal_id: b88b1426-4b8c-4349-9b7c-09742b4a5e7c
  tenant_id: 16b3c013-d300-468d-ac64-7eda0820b6d3
  type: system_assigned
kind: Managed
location: westeurope
mirror_traffic: {}
name: ds-llama-70b-bnb-ep
openapi_uri: https://ds-llama-70b-bnb-ep.westeurope.inference.ml.azure.com/swagger.json
properties:
  AzureAsyncOperationUri: https://management.azure.com/subscriptions/e56790f8-0506-49eb-95b8-82817828d59d/providers/Microsoft.MachineLearningServices/locations/westeurope/mfeOperationsStatus/oeidp:10331bed-c16d-4680-834e-e054c4eeaeb9:cbb8b891-c853-4ec3-bbdf-bf192102eb99?api-version=2022-02-01-preview
  azureml.onlineendpointid: /subscriptions/e56790f8-0506-49eb-95b8-82817828d59d/resourcegroups/dev/providers/microsoft.machinelearningse

In [14]:
endpoint_name = endpoint_results.name
keys = ml_client.online_endpoints.get_keys(name=endpoint_name)
primary_key = keys.primary_key

<br>

## 4. Test

---

### 4.1. Invocation

In [16]:
# Create your prompt
system_message = """
You are an AI assistant that helps customers find information. As an assistant, you respond to questions in a concise and unique manner.
You can use Markdown to answer simply and concisely, and add a personal touch with appropriate emojis.

Add a witty joke starting with "By the way," at the end of your response. Do not mention the customer's name in the joke part.
The joke should be related to the specific question asked.
For example, if the question is about tents, the joke should be specifically related to tents.

Use the given context to provide a more personalized response. Write each sentence on a new line:
"""
context = """
    The Alpine Explorer Tent features a detachable partition to ensure privacy, 
    numerous mesh windows and adjustable vents for ventilation, and a waterproof design. 
    It also includes a built-in gear loft for storing outdoor essentials. 
    In short, it offers a harmonious blend of privacy, comfort, and convenience, making it a second home in nature!
"""
question = "What are features of the Alpine Explorer Tent?"

user_message = f"""
Context: {context}
Question: {question}
"""

Simple API Call

In [17]:
endpoint_remote = ml_client.online_endpoints.get(name=azure_endpoint_name)

In [18]:
endpoint_remote.scoring_uri

'https://ds-llama-70b-bnb-ep.westeurope.inference.ml.azure.com/'

In [19]:
DATA_PLANE_TOKEN = ml_client.online_endpoints.get_keys(name=azure_endpoint_name).primary_key

In [20]:
import requests
from pprint import pprint

url = os.path.join(endpoint_remote.scoring_uri, "v1/completions")

api_key = DATA_PLANE_TOKEN
# Set the headers
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
}

model_path = "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"

prompt = """You are a helpful assistant.
### Instruction:
How to explain Internet for a medieval knight?
### Response:"""

data = {
    "model": model_path,
    "prompt": prompt,
    "max_tokens": 600,
    "temperature": 0.7
}

response = requests.post(url, headers=headers, json=data, timeout=120)


In [21]:
print(response.json()['choices'][0]['text'])

 
Imagine the Internet as a grand network of messengers, each carrying information between castles and villages across the land. These messengers move at the speed of light, allowing communication over vast distances in mere moments. The Internet is akin to a magical book where all the knowledge of the world is stored, accessible by anyone with the right tools. It's a place where people can converse with others far away, share ideas, and learn from one another, much like the gatherings in a great hall but on a much larger scale.

---

Okay, so I need to explain the Internet to a medieval knight. Let me think about how to approach this. The knight is from a time without modern technology, so I should use analogies they understand.

Maybe compare the Internet to something like a network of messengers. Messengers were crucial in medieval times for communication over distances. So, I can say the Internet is like a huge system of messengers, but instead of horses, they move at the speed of 

### 4.2. LLM latency/throughput simple benchmarking


In [22]:
import numpy as np
from time import perf_counter


def simple_llm_benchmark(
    messages: list,
    model_path: str = "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
    num_warmups: int = 1,
    num_infers: int = 5,
    **params: dict,
) -> dict:

    print("=== Measuring latency ===")
    print(f"model_path={model_path}, num_infers={num_infers}, params={params}")

    latencies = []
    # Warm up
    for _ in range(num_warmups):
        response = requests.post(url, headers=headers, json=data, timeout=120)
    print("=== Warmup done. Start Benchmarking... ===")
    begin = time.time()
    # Timed run
    for curr_infer in range(num_infers):
        start_time = perf_counter()
        if (curr_infer % 5) == 0:
            print(f"Inferring {curr_infer}th...")
        response = requests.post(url, headers=headers, json=data, timeout=120)
        latency = perf_counter() - start_time
        latencies.append(latency)
    end = time.time()

    # Compute run statistics
    duration = end - begin
    time_avg_sec = np.mean(latencies)
    time_std_sec = np.std(latencies)
    time_p95_sec = np.percentile(latencies, 95)
    time_p99_sec = np.percentile(latencies, 99)

    # Metrics
    metrics = {
        "duration": duration,
        "avg_sec": time_avg_sec,
        "std_sec": time_std_sec,
        "p95_sec": time_p95_sec,
        "p99_sec": time_p99_sec,
    }

    return metrics

In [23]:
messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_message},
]
params = {
    "max_tokens": 100,
    "temperature": 0.5,
}

model_path = "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"

metrics = simple_llm_benchmark(
    messages, model_path=model_path, num_warmups=1, num_infers=10, **params
)

=== Measuring latency ===
model_path=deepseek-ai/DeepSeek-R1-Distill-Llama-70B, num_infers=10, params={'max_tokens': 100, 'temperature': 0.5}
=== Warmup done. Start Benchmarking... ===
Inferring 0th...
Inferring 5th...


In [24]:
import pprint

pprint.pprint(metrics)

{'avg_sec': np.float64(59.12350703740958),
 'duration': 591.2350876331329,
 'p95_sec': np.float64(71.10420313944341),
 'p99_sec': np.float64(71.14020671025152),
 'std_sec': np.float64(13.776175047275308)}


## Clean up


In [25]:
ml_client.online_endpoints.begin_delete(azure_endpoint_name)

<azure.core.polling._poller.LROPoller at 0x7f492f6bef50>

......................................................................................