# DeepSeek R1-UD-IQ1_M llama.cpp serving using the Azure ML Python SDK

> [1] Please use `Python 3.10 - SDK v2 (azureml_py310_sdkv2)` conda environment.<br>[2] Please make sure you prepare [Hugging Face API Token](https://huggingface.co/docs/hub/security-tokens).

## Download Preprocessed Quantized Models

First, install `huggingface-cli`, which is used to download models from Hugging Face.

In [None]:
%pip install -U "huggingface_hub[cli]"

In [None]:
!huggingface-cli login --token <your token>

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/azureuser/.cache/huggingface/token
Login successful


Now you can download the model files to local storage; they are registered as an Azure ML model asset later in this notebook. The command below downloads the `DeepSeek-R1-UD-IQ1_M` files from `unsloth/DeepSeek-R1-GGUF`.

In [None]:
!huggingface-cli download unsloth/DeepSeek-R1-GGUF --include "*UD-IQ1_M*" --local-dir DeepSeek-R1-GGUF --cache-dir .cache

## 1. Load config file

---


In [None]:
import os
import yaml
from logger import logger
from datetime import datetime

# Date stamp (YYYY-MM-DD) of this notebook run.
snapshot_date = datetime.now().strftime("%Y-%m-%d")

# config.yml only holds plain scalars, so the safe loader is sufficient and
# never constructs arbitrary Python objects from YAML tags.
with open("config.yml", encoding="utf-8") as f:
    d = yaml.safe_load(f)

# Workspace coordinates and model location, all read from the "config" section.
AZURE_SUBSCRIPTION_ID = d["config"]["AZURE_SUBSCRIPTION_ID"]
AZURE_RESOURCE_GROUP = d["config"]["AZURE_RESOURCE_GROUP"]
AZURE_WORKSPACE = d["config"]["AZURE_WORKSPACE"]
# Left empty on purpose here; the Hugging Face login is done via the CLI cell.
HF_TOKEN = ''
HF_MODEL_NAME_OR_PATH = d["config"]["HF_MODEL_NAME_OR_PATH"]
IS_DEBUG = d["config"]["IS_DEBUG"]

# Names of the Azure ML assets created and used by the following cells.
azure_env_name = 'deepseek-llm-cpp'
azure_model_name = 'deepseek-r1-m'
azure_endpoint_name = 'deepseek-r1-m-endpoint'
azure_deployment_name = 'blue'
azure_serving_cluster_size = 'Standard_ND40rs_v2'

# Echo the effective settings so a run can be audited from the log.
if IS_DEBUG:
    logger.debug("===== 0. Azure ML Deployment Info =====")
    logger.debug(f"AZURE_SUBSCRIPTION_ID={AZURE_SUBSCRIPTION_ID}")
    logger.debug(f"AZURE_RESOURCE_GROUP={AZURE_RESOURCE_GROUP}")
    logger.debug(f"AZURE_WORKSPACE={AZURE_WORKSPACE}")
    logger.debug(f"HF_MODEL_NAME_OR_PATH={HF_MODEL_NAME_OR_PATH}")
    logger.debug(f"IS_DEBUG={IS_DEBUG}")

    logger.debug(f"azure_env_name={azure_env_name}")
    logger.debug(f"azure_model_name={azure_model_name}")
    logger.debug(f"azure_endpoint_name={azure_endpoint_name}")
    logger.debug(f"azure_deployment_name={azure_deployment_name}")
    logger.debug(f"azure_serving_cluster_size={azure_serving_cluster_size}")

2025-03-05 13:51:09,079 - logger - DEBUG - ===== 0. Azure ML Deployment Info =====
2025-03-05 13:51:09,079 - logger - DEBUG - AZURE_SUBSCRIPTION_ID=<your-subscription-id>
2025-03-05 13:51:09,079 - logger - DEBUG - AZURE_RESOURCE_GROUP=<your-resource-group>
2025-03-05 13:51:09,079 - logger - DEBUG - AZURE_WORKSPACE=<your-workspace-name>
2025-03-05 13:51:09,079 - logger - DEBUG - HF_MODEL_NAME_OR_PATH=DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_M
2025-03-05 13:51:09,079 - logger - DEBUG - IS_DEBUG=True
2025-03-05 13:51:09,079 - logger - DEBUG - azure_env_name=deepseek-llm-cpp
2025-03-05 13:51:09,079 - logger - DEBUG - azure_model_name=deepseek-r1-m
2025-03-05 13:51:09,079 - logger - DEBUG - azure_endpoint_name=deepseek-r1-m-endpoint
2025-03-05 13:51:09,079 - logger - DEBUG - azure_deployment_name=blue
2025-03-05 13:51:09,079 - logger - DEBUG - azure_serving_cluster_size=Standard_ND40rs_v2


<br>

## 2. Serving preparation

---

### 2.1. Configure workspace details

To connect to a workspace, we need identifying parameters - a subscription, a resource group, and a workspace name. We will use these details in the MLClient from azure.ai.ml to get a handle on the Azure Machine Learning workspace we need. We will use the default Azure authentication for this hands-on.


In [4]:
%pip install -q azure-ai-ml azure-identity

Note: you may need to restart the kernel to use updated packages.


In [5]:
# import required libraries
import time
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient, Input
from azure.ai.ml import command
from azure.ai.ml.entities import Model
from azure.ai.ml.constants import AssetTypes
from azure.core.exceptions import ResourceNotFoundError, ResourceExistsError

logger.info("===== 2. Serving preparation =====")
logger.info("Calling DefaultAzureCredential.")
credential = DefaultAzureCredential()
ml_client = None
try:
    # Prefer a workspace config file when one can be found.
    ml_client = MLClient.from_config(credential)
except Exception as ex:
    # No usable config file: fall back to the values read from config.yml.
    # Logged (rather than printed) so it lands next to the other messages.
    logger.warning(f"MLClient.from_config failed, using config.yml values: {ex}")
    ml_client = MLClient(
        credential, AZURE_SUBSCRIPTION_ID, AZURE_RESOURCE_GROUP, AZURE_WORKSPACE
    )

2025-03-02 07:38:01,054 - logger - INFO - ===== 2. Serving preparation =====
2025-03-02 07:38:01,055 - logger - INFO - Calling DefaultAzureCredential.
Found the config file in: /config.json


### 2.2. Create model asset


In [6]:
def get_or_create_model_asset(
    ml_client,
    model_name,
    job_name=None,
    model_dir="outputs",
    model_type="custom_model",
    update=False,
):
    """Return the latest registered model asset, creating it when needed.

    Args:
        ml_client: Azure ML client used to list, fetch and register models.
        model_name: Name of the model asset.
        job_name: When given, the model is registered from that job's
            output artifacts; otherwise ``model_dir`` is used as the path.
        model_dir: Local directory, or sub-path inside the job outputs.
        model_type: Asset type (mlflow_model, custom_model, triton_model).
        update: When True, register a new version even if one exists.

    Returns:
        The existing or newly created model asset.
    """
    try:
        versions = [int(m.version) for m in ml_client.models.list(name=model_name)]
        if not versions:
            # An empty listing would make max() raise ValueError; treat it
            # like a missing asset so the creation path below is taken.
            raise ResourceNotFoundError(
                f"No versions found for Model asset: {model_name}."
            )
        if update:
            raise ResourceExistsError("Found Model asset, but will update the Model.")
        model_asset = ml_client.models.get(name=model_name, version=max(versions))
        logger.info(f"Found Model asset: {model_name}. Will not create again")
    except (ResourceNotFoundError, ResourceExistsError) as e:
        logger.info(f"Exception: {e}")
        if job_name is None:
            model_path = model_dir
        else:
            model_path = (
                f"azureml://jobs/{job_name}/outputs/artifacts/paths/{model_dir}/"
            )
        run_model = Model(
            name=model_name,
            path=model_path,
            description="Model created from run.",
            type=model_type,  # mlflow_model, custom_model, triton_model
        )
        model_asset = ml_client.models.create_or_update(run_model)
        logger.info(f"Created Model asset: {model_name}")

    return model_asset

In [None]:
# Register the local GGUF directory as a custom model asset. With update=False
# an already registered asset of this name is reused instead of created again.
model = get_or_create_model_asset(
    ml_client,
    azure_model_name,
    job_name=None,
    model_dir="DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_M",
    model_type="custom_model",
    update=False,
)

2025-03-02 07:38:26,957 - logger - INFO - Exception: (UserError) The specified resource was not found.
Code: UserError
Message: The specified resource was not found.
Exception Details:	(ModelNotFound) Model container with name: deepseek-r1-m not found.
	Code: ModelNotFound
	Message: Model container with name: deepseek-r1-m not found.
Your file exceeds 100 MB. If you experience low speeds, latency, or broken connections, we recommend using the AzCopyv10 tool for this file transfer.

Example: azcopy copy '/mnt/batch/tasks/shared/LS_root/mounts/clusters/jihualiu3/code/llm-inferencing/DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_M' 'https://testvllm3226465244.blob.core.windows.net/azureml-blobstore-10331bed-c16d-4680-834e-e054c4eeaeb9/LocalUpload/3f4231739782c924996c973854896c6c/DeepSeek-R1-UD-IQ1_M' 

See https://learn.microsoft.com/azure/storage/common/storage-use-azcopy-v10 for more information.
[32mUploading DeepSeek-R1-UD-IQ1_M (168916.28 MBs):  12%|█▏        | 19942643040/168916283648 [06:10

### 2.3. Create AzureML environment

Azure ML defines containers (called environment assets) in which your code will run. We can use a built-in environment or build a custom environment (Docker container, conda). This hands-on uses a Docker container.


#### Docker environment


In [19]:
from azure.ai.ml.entities import Environment, BuildContext


def get_or_create_docker_environment_asset(
    ml_client, env_name, docker_dir, inference_config=None, update=False
):
    """Return the latest environment asset, building it from Docker if needed.

    Args:
        ml_client: Azure ML client used to list, fetch and create environments.
        env_name: Name of the environment asset.
        docker_dir: Directory used as the Docker build context.
        inference_config: Optional route settings passed to the environment.
        update: When True, create a new version even if one exists.

    Returns:
        The existing or newly created environment asset.
    """
    try:
        versions = [
            int(e.version) for e in ml_client.environments.list(name=env_name)
        ]
        if not versions:
            # An empty listing would make max() raise ValueError; treat it
            # like a missing asset so the creation path below is taken.
            raise ResourceNotFoundError(
                f"No versions found for Environment asset: {env_name}."
            )
        if update:
            raise ResourceExistsError(
                "Found Environment asset, but will update the Environment."
            )
        env_asset = ml_client.environments.get(name=env_name, version=max(versions))
        logger.info(f"Found Environment asset: {env_name}. Will not create again")
    except (ResourceNotFoundError, ResourceExistsError) as e:
        logger.info(f"Exception: {e}")
        env_docker_image = Environment(
            build=BuildContext(path=docker_dir),
            name=env_name,
            description="Environment created from a Docker context.",
            inference_config=inference_config,
        )
        env_asset = ml_client.environments.create_or_update(env_docker_image)
        logger.info(f"Created Environment asset: {env_name}")

    return env_asset


# Routes passed to the environment: the liveness and readiness checks both
# use /health, and scoring requests go to the root path, all on port 8000.
inference_config = {
    "liveness_route": {
        "port": 8000,
        "path": "/health",
    },
    "readiness_route": {
        "port": 8000,
        "path": "/health",
    },
    "scoring_route": {
        "port": 8000,
        "path": "/",
    },
}

# update=True creates the environment from the "docker-r1" build context
# again even when an environment of this name already exists.
env = get_or_create_docker_environment_asset(
    ml_client, azure_env_name, "docker-r1", inference_config, update=True
)

2025-03-02 09:50:35,847 - logger - INFO - Exception: Found Environment asset, but will update the Environment.
[32mUploading docker-r1 (0.0 MBs): 100%|██████████| 1072/1072 [00:00<00:00, 12249.71it/s]
[39m

2025-03-02 09:50:45,869 - logger - INFO - Created Environment asset: deepseek-llm-cpp


<br>

## 3. Serving

---

### 3.1. Create endpoint

Create an endpoint. This process does not provision a GPU cluster yet.


In [None]:
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
)

logger.info("===== 3. Serving =====")

t0 = time.time()

# Check if the endpoint already exists in the workspace
try:
    endpoint = ml_client.online_endpoints.get(azure_endpoint_name)
    logger.info("---Endpoint already exists---")
except ResourceNotFoundError:
    # Only a missing endpoint leads to creation. Any other failure (auth,
    # network, interrupt) propagates instead of being mistaken for "not found".

    # Define the endpoint
    endpoint = ManagedOnlineEndpoint(
        name=azure_endpoint_name,
        description=f"Test endpoint for {model.name}",
    )

    # Trigger the endpoint creation
    try:
        ml_client.begin_create_or_update(endpoint).wait()
        logger.info("\n---Endpoint created successfully---\n")
    except Exception as err:
        raise RuntimeError(f"Endpoint creation failed. Detailed Response:\n{err}") from err

t1 = time.time()

from humanfriendly import format_timespan

# Report how long the lookup/creation took in human-readable form.
timespan = format_timespan(t1 - t0)
logger.info(f"Creating Endpoint took {timespan}")

2025-03-02 09:50:45,911 - logger - INFO - ===== 3. Serving =====
2025-03-02 09:50:46,286 - logger - INFO - ---Endpoint already exists---
Readonly attribute principal_id will be ignored in class <class 'azure.ai.ml._restclient.v2022_05_01.models._models_py3.ManagedServiceIdentity'>
Readonly attribute tenant_id will be ignored in class <class 'azure.ai.ml._restclient.v2022_05_01.models._models_py3.ManagedServiceIdentity'>
2025-03-02 09:52:18,738 - logger - INFO - 
---Endpoint created successfully---

2025-03-02 09:52:18,751 - logger - INFO - Creating Endpoint took 1 minute and 32.81 seconds


### 3.2. Create Deployment

Create a Deployment. This takes a lot of time as GPU clusters must be provisioned and the serving environment must be built.


In [21]:
# Environment variables handed to the serving container.
# NOTE(review): presumably consumed by the entrypoint in the "docker-r1"
# build context — confirm there how each variable is used.
env_vars = {
    "MODEL_NAME": '/models/DeepSeek-R1-UD-IQ1_M/DeepSeek-R1-UD-IQ1_M-00001-of-00004.gguf',  # first GGUF shard, located under the deployment's model_mount_path ('/models')
    "LAYER_N": "61",
    "PREDICT_N": "10000",
    "VLLM_ARGS": "",
}
# Shallow copy so the deployment gets its own dict.
deployment_env_vars = {**env_vars}

In [None]:
%%time
import time
from azure.ai.ml.entities import (    
    OnlineRequestSettings,
    CodeConfiguration,
    ManagedOnlineDeployment,
    ProbeSettings,
    Environment
)

t0 = time.time()

# Request limits applied to every serving instance.
serving_request_settings = OnlineRequestSettings(
    max_concurrent_requests_per_instance=2,
    request_timeout_ms=120000,
    max_queue_wait_ms=240000,
)

# Health probes; both wait 120 s before the first check.
serving_liveness_probe = ProbeSettings(
    failure_threshold=5,
    success_threshold=1,
    timeout=10,
    period=30,
    initial_delay=120,
)
serving_readiness_probe = ProbeSettings(
    failure_threshold=30,
    success_threshold=1,
    timeout=2,
    period=10,
    initial_delay=120,
)

# Deployment definition: one instance with the model mounted under /models.
deployment = ManagedOnlineDeployment(
    name=azure_deployment_name,
    endpoint_name=azure_endpoint_name,
    model=model,
    model_mount_path='/models',
    instance_type=azure_serving_cluster_size,
    instance_count=1,
    environment_variables=deployment_env_vars,
    environment=env,
    request_settings=serving_request_settings,
    liveness_probe=serving_liveness_probe,
    readiness_probe=serving_readiness_probe,
)

# Trigger the deployment creation and wait for it to finish.
try:
    deployment_poller = ml_client.begin_create_or_update(deployment)
    deployment_poller.wait()
    logger.info("\n---Deployment created successfully---\n")
except Exception as err:
    raise RuntimeError(
        f"Deployment creation failed. Detailed Response:\n{err}"
    ) from err

# Send all endpoint traffic to the new deployment. The returned poller is
# kept so that a later cell can collect the result.
endpoint.traffic = {azure_deployment_name: 100}
endpoint_poller = ml_client.online_endpoints.begin_create_or_update(endpoint)

t1 = time.time()
timespan = format_timespan(t1 - t0)
logger.info(f"Creating deployment took {timespan}")

Check: endpoint deepseek-r1-m-endpoint exists


..........................................................................................................................................................................................

2025-03-02 10:08:15,533 - logger - INFO - 
---Deployment created successfully---

Readonly attribute principal_id will be ignored in class <class 'azure.ai.ml._restclient.v2022_05_01.models._models_py3.ManagedServiceIdentity'>
Readonly attribute tenant_id will be ignored in class <class 'azure.ai.ml._restclient.v2022_05_01.models._models_py3.ManagedServiceIdentity'>
2025-03-02 10:08:17,075 - logger - INFO - Creating deployment took 15 minutes and 58.26 seconds


CPU times: user 2.3 s, sys: 231 ms, total: 2.54 s
Wall time: 15min 58s


In [23]:
# Collect the result of the endpoint traffic update started in the deployment cell.
endpoint_results = endpoint_poller.result()

In [24]:
# Show the endpoint details returned by the poller.
print(endpoint_results)

auth_mode: key
description: Test endpoint for deepseek-r1-m
id: /subscriptions/e56790f8-0506-49eb-95b8-82817828d59d/resourceGroups/dev/providers/Microsoft.MachineLearningServices/workspaces/test-vllm/onlineEndpoints/deepseek-r1-m-endpoint
identity:
  principal_id: 5128e5cf-e952-496e-beeb-ebff815a1c1c
  tenant_id: 16b3c013-d300-468d-ac64-7eda0820b6d3
  type: system_assigned
kind: Managed
location: westeurope
mirror_traffic: {}
name: deepseek-r1-m-endpoint
openapi_uri: https://deepseek-r1-m-endpoint.westeurope.inference.ml.azure.com/swagger.json
properties:
  AzureAsyncOperationUri: https://management.azure.com/subscriptions/e56790f8-0506-49eb-95b8-82817828d59d/providers/Microsoft.MachineLearningServices/locations/westeurope/mfeOperationsStatus/oeidp:10331bed-c16d-4680-834e-e054c4eeaeb9:4ed34a2c-e898-4e22-a232-a94b2a3b976e?api-version=2022-02-01-preview
  azureml.onlineendpointid: /subscriptions/e56790f8-0506-49eb-95b8-82817828d59d/resourcegroups/dev/providers/microsoft.machinelearningse

In [25]:
# Fetch the endpoint's auth keys; the primary key is used as the API key
# in the test cells that follow.
endpoint_name = endpoint_results.name
keys = ml_client.online_endpoints.get_keys(name=endpoint_name)
primary_key = keys.primary_key

<br>

## 4. Test

---

### 4.1. Invocation

Try calling the endpoint.


In [26]:
%pip install -q openai

Note: you may need to restart the kernel to use updated packages.


In [27]:
from openai import OpenAI

# Build the base URL with plain string handling: os.path.join is meant for
# filesystem paths and would insert a backslash on Windows.
url = f"{endpoint_results.scoring_uri.rstrip('/')}/v1"
endpoint_name = (
    endpoint_results.name if azure_endpoint_name is None else azure_endpoint_name
)
keys = ml_client.online_endpoints.get_keys(name=endpoint_name)
primary_key = keys.primary_key  # You can paste [YOUR Azure ML API KEY] here
llm = OpenAI(base_url=url, api_key=primary_key)
# Name the model this notebook deploys (UD-IQ1_M), matching the benchmark
# cell; the previous value pointed at a different quantization (IQ2_XXS).
model_path = "/models/DeepSeek-R1-UD-IQ1_M"

In [28]:
# Create your prompt
# System prompt: sets the assistant's tone and output format and asks for a
# closing joke that starts with "By the way,".
system_message = """
You are an AI assistant that helps customers find information. As an assistant, you respond to questions in a concise and unique manner.
You can use Markdown to answer simply and concisely, and add a personal touch with appropriate emojis.

Add a witty joke starting with "By the way," at the end of your response. Do not mention the customer's name in the joke part.
The joke should be related to the specific question asked.
For example, if the question is about tents, the joke should be specifically related to tents.

Use the given context to provide a more personalized response. Write each sentence on a new line:
"""
# Product description the answer should be based on.
context = """
    The Alpine Explorer Tent features a detachable partition to ensure privacy, 
    numerous mesh windows and adjustable vents for ventilation, and a waterproof design. 
    It also includes a built-in gear loft for storing outdoor essentials. 
    In short, it offers a harmonious blend of privacy, comfort, and convenience, making it a second home in nature!
"""
question = "What are features of the Alpine Explorer Tent?"

# User turn: the context followed by the question.
user_message = f"""
Context: {context}
Question: {question}
"""

Simple API Call

In [29]:
# Simple API Call: one non-streaming chat completion, then print its text.
chat_messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_message},
]
response = llm.chat.completions.create(
    model=model_path,
    messages=chat_messages,
    temperature=0.7,
    max_tokens=200,
)

first_choice = response.choices[0]
print(first_choice.message.content)

<think>
Okay, the user is asking about the features of the Alpine Explorer Tent. Let me check the context provided. The tent has a detachable partition for privacy, several mesh windows and adjustable vents for ventilation, a waterproof design, and a built-in gear loft. I need to list these features clearly.

I should start each sentence on a new line and use emojis to make it friendly. Then add a joke related to tents starting with "By the way," without mentioning the user's name. The joke should be witty and connected to the features. Maybe something about the tent being so nice it doesn’t want to leave? Or maybe about the gear loft being a secret storage spot. Hmm, the gear loft is for essentials, so a joke about hiding snacks there. That’s light and relatable. Perfect.
</think>

The Alpine Explorer Tent boasts a detachable partition for privacy 🏕️  
Multiple mesh windows and adjustable vents keep the airflow just right 🌬


Streaming

In [31]:
%%time
# Same request as the simple call, but consumed chunk by chunk as it arrives.
response = llm.chat.completions.create(
    model=model_path,
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ],
    temperature=0.7,
    max_tokens=200,
    stream=True,  # Stream the response
)

print("Streaming response:")
for chunk in response:
    # Skip chunks without choices so indexing cannot fail.
    if not chunk.choices:
        continue
    text = chunk.choices[0].delta.content
    # The attribute exists even when it is None (e.g. on the final chunk),
    # so test the value; otherwise a literal "None" is printed.
    if text:
        print(text, end="", flush=True)

Streaming response:
<think>
Okay, so I need to figure out the features of the Alpine Explorer Tent based on the given context. Let me start by reading the context again carefully.

The context says: The Alpine Explorer Tent features a detachable partition to ensure privacy, numerous mesh windows and adjustable vents for ventilation, and a waterproof design. It also includes a built-in gear loft for storing outdoor essentials. In short, it offers a harmonious blend of privacy, comfort, and convenience, making it a second home in nature!

Hmm, the question is asking for the features, so I need to list out each feature mentioned here. Let me parse each sentence.

First, "detachable partition to ensure privacy" – that's one feature: a detachable partition for privacy. Next, "numerous mesh windows and adjustable vents for ventilation" – so two parts here: mesh windows and adjustable vents, both for ventilation. Then, "waterproof design" – another feature. Also, "built-in gear loftNoneCPU ti

Another method

In [32]:
import requests

# Call the completions route directly over HTTP. The URL is built with string
# handling because os.path.join is for filesystem paths, not URLs.
completions_url = f"{endpoint_results.scoring_uri.rstrip('/')}/v1/completions"
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {primary_key}"}
data = {
    "model": model_path,
    "prompt": "San Francisco is a ",
    "max_tokens": 200,
    "temperature": 0.7,
}

# The timeout mirrors the deployment's 120 s request timeout so a stalled
# call cannot hang the notebook indefinitely.
response = requests.post(completions_url, headers=headers, json=data, timeout=120)
# Surface HTTP errors (401, 5xx, ...) directly rather than as a confusing
# JSON decoding failure on an error body.
response.raise_for_status()
print(response.json())

{'choices': [{'text': '46-square-mile peninsula surrounded by water on three sides. The city is home to some of the most expensive real estate in the country and is known for its steep hills. The hills are a result of the city’s location on the San Andreas Fault, which runs through California. The fault is responsible for the city’s earthquakes, which have caused the hills to form over time.\n\nSan Francisco has 43 hills, according to the city’s official website. The city is located on a peninsula that is surrounded by the Pacific Ocean, San Francisco Bay, and Golden Gate. The steepness of the hills is due in part to the city’s location on the San Andreas Fault. Because San Francisco is so hilly, many of its streets are extremely steep. It was built on seven hills, similar to Rome. The city’s highest peak is Mount Davidson, which is located at 927 feet above sea level. The San Andreas Fault runs through the city and is responsible for earthquakes.\n\n', 'index': 0, 'logprobs': None, 'f

### 4.2. LLM latency/throughput simple benchmarking


In [33]:
import numpy as np
from time import perf_counter


def simple_llm_benchmark(
    llm: "OpenAI",
    messages: list,
    model_path: str = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    num_warmups: int = 1,
    num_infers: int = 5,
    **params: object,
) -> dict:
    """Measure the latency of repeated chat completion calls.

    Args:
        llm: Client exposing ``chat.completions.create``.
        messages: Chat messages sent unchanged on every call.
        model_path: Model identifier passed as ``model``.
        num_warmups: Number of untimed calls issued before measuring.
        num_infers: Number of timed calls; must be at least 1.
        **params: Extra keyword arguments forwarded to every call
            (e.g. ``max_tokens``, ``temperature``).

    Returns:
        A dict with ``duration`` (seconds for the whole timed loop) and the
        per-call latency statistics ``avg_sec``, ``std_sec``, ``p95_sec``
        and ``p99_sec``.

    Raises:
        ValueError: If ``num_infers`` is smaller than 1, since statistics
            over zero samples are undefined.
    """
    if num_infers < 1:
        raise ValueError("num_infers must be at least 1")

    print("=== Measuring latency ===")
    print(f"model_path={model_path}, num_infers={num_infers}, params={params}")

    # Warm up
    for _ in range(num_warmups):
        llm.chat.completions.create(
            model=model_path,
            messages=messages,
            **params,
        )
    print("=== Warmup done. Start Benchmarking... ===")

    latencies = []
    # perf_counter is monotonic, so the total duration uses the same clock
    # as the per-call latencies.
    begin = perf_counter()
    # Timed run
    for curr_infer in range(num_infers):
        # Progress output happens before the timer starts so that console
        # I/O is not counted as request latency.
        if (curr_infer % 5) == 0:
            print(f"Inferring {curr_infer}th...")
        start_time = perf_counter()
        llm.chat.completions.create(
            model=model_path,
            messages=messages,
            **params,
        )
        latencies.append(perf_counter() - start_time)
    duration = perf_counter() - begin

    # Metrics
    metrics = {
        "duration": duration,
        "avg_sec": np.mean(latencies),
        "std_sec": np.std(latencies),
        "p95_sec": np.percentile(latencies, 95),
        "p99_sec": np.percentile(latencies, 99),
    }

    return metrics

In [34]:
# Conversation and sampling settings reused for every benchmark request.
messages = [
    dict(role="system", content=system_message),
    dict(role="user", content=user_message),
]
params = dict(max_tokens=100, temperature=0.5)

model_path = "/models/DeepSeek-R1-UD-IQ1_M"

# One warm-up call followed by ten timed calls.
metrics = simple_llm_benchmark(
    llm,
    messages,
    model_path=model_path,
    num_warmups=1,
    num_infers=10,
    **params,
)

=== Measuring latency ===
model_path=/models/deepseek-adapter/DeepSeek-R1-Distill-Qwen-1.5B, num_infers=10, params={'max_tokens': 100, 'temperature': 0.5}
=== Warmup done. Start Benchmarking... ===
Inferring 0th...
Inferring 5th...


In [35]:
import pprint

# Pretty-print the benchmark metrics dict.
pprint.pprint(metrics)

{'avg_sec': 7.823659540410153,
 'duration': 78.23661303520203,
 'p95_sec': 7.972206991264829,
 'p99_sec': 7.999277103060158,
 'std_sec': 0.15908935995443393}


## Clean up


In [36]:
# Start deleting the endpoint. begin_delete returns a poller and this cell
# does not wait on it.
ml_client.online_endpoints.begin_delete(azure_endpoint_name)

<azure.core.polling._poller.LROPoller at 0x7f417cd79390>

...............................................................................