## Prometheus Client

In [2]:
import asyncio
from typing import Any, Dict, List

import httpx


class PrometheusClient:
    """Asynchronous client for the Prometheus HTTP API.

    One `httpx.AsyncClient` is kept per instance and reused for every request.
    Call `aclose()` (or use the instance in an `async with` block) once the
    client is no longer needed so its connections are released.
    """

    _url: str = None
    _timeout: float = None
    _aclient: httpx.AsyncClient = None

    def __init__(self, url: str, timeout: float = 60.0):
        """
        Prometheus client to interact with the Prometheus server.

        Args:
            url (`str`): URL of the Prometheus server.
            timeout (`float`): Timeout for the HTTP requests.
        """

        self.url = url
        # The `timeout` setter also builds the HTTP client, so the client must
        # not be created a second time here (that would leak the first one).
        self.timeout = timeout

    async def __aenter__(self) -> "PrometheusClient":
        """Return the client itself so it can be used with `async with`."""

        return self

    async def __aexit__(self, exc_type, exc_value, traceback) -> None:
        """Close the underlying HTTP client when the `async with` block ends."""

        await self.aclose()

    async def aclose(self) -> None:
        """Close the underlying HTTP client and release its connections."""

        if self._aclient is not None:
            await self._aclient.aclose()

    @property
    def url(self):
        """URL of the Prometheus server."""

        return self._url

    @url.setter
    def url(self, value):
        self._url = value

    @property
    def timeout(self):
        """Timeout for the HTTP requests."""

        return self._timeout

    @timeout.setter
    def timeout(self, value):
        self._timeout = value
        self._htimeout = httpx.Timeout(value)

        if self._aclient is None or self._aclient.is_closed:
            self._aclient = httpx.AsyncClient(timeout=self._htimeout)
        else:
            # Update the open client in place; replacing it would abandon the
            # old client without ever closing its connections.
            self._aclient.timeout = self._htimeout

    async def get_targets(self) -> Dict[str, Any]:
        """Get all of the targets that Prometheus is scraping.

        Returns:
            targets (`Dict`): Decoded JSON body of `GET /api/v1/targets`.

        Raises:
            httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        """

        response = await self._aclient.get(f"{self.url}/api/v1/targets")
        response.raise_for_status()

        return response.json()

    async def execute_query(self, query) -> Dict[str, Any]:
        """Execute a query on the Prometheus server.

        Args:
            query (`str`): PromQL query to execute

        Returns:
            response (`Dict`): Decoded JSON body of `GET /api/v1/query`.

        Raises:
            httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        """

        response = await self._aclient.get(f"{self.url}/api/v1/query", params={"query": query})
        response.raise_for_status()

        return response.json()

    async def execute_multiple_queries(self, queries: List[str]) -> Dict[str, Dict]:
        """Execute multiple queries on the Prometheus server concurrently.

        Args:
            queries (`List[str]`): List of PromQL queries to execute

        Returns:
            queries_response (`Dict[str, Dict]`): Query results keyed by the
                query string. A query listed more than once appears once.
        """

        # `gather` returns the results in the same order as the awaitables,
        # so they can be paired with the queries positionally.
        results = await asyncio.gather(*(self.execute_query(query) for query in queries))

        return dict(zip(queries, results))

## Main

In [3]:
# Address of the Prometheus server that the notebook queries.
scheme, host, port = "http://", "10.20.1.93", 30090

# Shared client used by the cells below.
prometheus_client = PrometheusClient(f"{scheme}{host}:{port}")

### Get Prometheus Targets

In [4]:
import json

# Fetch the scrape-target listing from the Prometheus server.
targets = await prometheus_client.get_targets()
# Optional: uncomment to dump the response to `targets.json` for inspection.
# with open("targets.json", "w") as f:
#     f.write(json.dumps(targets, indent=4))
#     f.close()

### Execute Multiple Prometheus Queries

In [5]:
# Run the DCGM GPU metric queries concurrently; the result is a dict keyed by
# the query string.
node_gpu_info = await prometheus_client.execute_multiple_queries([
    "DCGM_FI_DEV_FB_FREE",
    "DCGM_FI_DEV_FB_USED",
    "DCGM_FI_DEV_GPU_TEMP",
    "DCGM_FI_DEV_GPU_UTIL",
    "DCGM_FI_DEV_POWER_USAGE"
])

# Optional: uncomment to dump the raw results to `dcgm_gpu_info.json`.
# with open("dcgm_gpu_info.json", "w") as f:
#     f.write(json.dumps(node_gpu_info, indent=4))
#     f.close()

## Ollama Client

In [6]:
from llm.ollama import OllamaClient

# Connect to the Ollama server, list its models and print the listing as
# indented JSON.
ollama_client = OllamaClient("http://10.244.0.84:8000")
models = await ollama_client.list()
print(models.model_dump_json(indent=4))

ModuleNotFoundError: No module named 'llm'

## GPU Dispatcher

In [34]:
import math
import re

from const.format import iB
from gpu.dispatcher.types import GPU, GPUNode, GPUNodeList
from llm.ollama import ListResponse


class GPUDispatcher:
    """Picks GPUs for LLM inference based on Prometheus DCGM metrics.

    The class is a singleton: every `GPUDispatcher(...)` call returns the same
    object. `__init__` still runs on each call, so the stored Prometheus data
    is replaced by whatever the latest call passed in.

    Typical use: construct with the raw query results, call
    `get_formatted_nodes_gpu_info()` to build the per-node GPU list, then call
    `get_available_gpus()` to find GPUs with enough free memory for a model.
    """

    _instance = None

    # Prometheus (DCGM) metric name -> attribute name on `GPU`
    _METRIC_TO_GPU_FIELD = {
        "DCGM_FI_DEV_FB_FREE": "free_memory",
        "DCGM_FI_DEV_FB_USED": "used_memory",
        "DCGM_FI_DEV_GPU_TEMP": "temperature",
        "DCGM_FI_DEV_GPU_UTIL": "memory_usage",
        "DCGM_FI_DEV_POWER_USAGE": "power_usage",
    }

    # Divisor that converts a parameter count with the given suffix to billions
    _PARAMETER_UNIT_DIVISOR = {"K": 1_000_000, "M": 1_000, "B": 1}

    def __new__(cls, *args, **kwargs):
        """Return the shared `GPUDispatcher` instance, creating it on first use."""

        if cls._instance is None:
            cls._instance = super().__new__(cls)

        return cls._instance

    # Node GPU information from Prometheus (raw query results keyed by query)
    _node_gpu_info: Dict[str, Dict] = {}

    # Formatted node GPU information, filled by `get_formatted_nodes_gpu_info()`
    _formatted_nodes_gpu_info: GPUNodeList = GPUNodeList([])

    def __init__(self, node_gpu_info: Dict[str, Dict]):
        """Initializes the GPUDispatcher to dispatch the GPU resources.

        Args:
            node_gpu_info (`Dict[str, Dict]`): Node GPU information from Prometheus,
                keyed by the PromQL query that produced each result
        """

        self.node_gpu_info = node_gpu_info

    @property
    def node_gpu_info(self):
        """Node GPU information from Prometheus."""

        return self._node_gpu_info

    @node_gpu_info.setter
    def node_gpu_info(self, value: Dict[str, Dict]):
        self._node_gpu_info = value

    @property
    def formatted_nodes_gpu_info(self):
        """Formatted node GPU information from Prometheus."""

        return self._formatted_nodes_gpu_info

    @formatted_nodes_gpu_info.setter
    def formatted_nodes_gpu_info(self, value: GPUNodeList):
        self._formatted_nodes_gpu_info = value

    def prometheus_metrics_name_mapping(self, metrics_name: str) -> str:
        """Map a Prometheus metrics name to the GPU metrics name.

        Args:
            metrics_name (`str`): Prometheus metrics name

        Returns:
            gpu_metrics_name (`str`): Attribute name on `GPU`, or `None` when
                the metric is not one of the supported DCGM metrics
        """

        return self._METRIC_TO_GPU_FIELD.get(metrics_name)

    def get_formatted_nodes_gpu_info(self) -> GPUNodeList:
        """Group the raw Prometheus results by node and GPU.

        Every sample of every query is assigned to the `GPU` identified by its
        `kubernetes_node` and `gpu` labels; nodes and GPUs are created the
        first time they are seen. The result is also stored on the instance.

        Returns:
            formatted_nodes_gpu_info (`GPUNodeList`): Formatted node GPU information from Prometheus

        Raises:
            ValueError: If a query is not one of the supported DCGM metrics
        """

        gpu_node_list: GPUNodeList = GPUNodeList([])

        for query, response in self.node_gpu_info.items():
            field_name = self.prometheus_metrics_name_mapping(query)
            if field_name is None:
                # Fail with a clear message instead of the opaque TypeError
                # that setting an attribute named `None` would produce.
                raise ValueError(f"Unsupported Prometheus metric: {query}")

            for sample in response["data"]["result"]:
                labels = sample["metric"]
                node_name = labels["kubernetes_node"]
                gpu_index = f"cuda:{labels['gpu']}"

                gpu_node = next(
                    (known for known in gpu_node_list.gpu_nodes if known.node_name == node_name), None
                )
                if gpu_node is None:
                    gpu_node = GPUNode(node_name, [])
                    gpu_node_list.gpu_nodes.append(gpu_node)

                gpu_info = next(
                    (gpu for gpu in gpu_node.gpus if gpu.index == gpu_index), None
                )
                if gpu_info is None:
                    gpu_info = GPU(
                        index=gpu_index,
                        uuid=labels["UUID"],
                        name=labels["modelName"],
                        free_memory=None,
                        used_memory=None,
                        temperature=None,
                        memory_usage=None,
                        power_usage=None
                    )
                    gpu_node.gpus.append(gpu_info)

                # Sample values arrive as strings (e.g. '7955'); go through
                # float first so values with a fractional part also convert.
                setattr(gpu_info, field_name, int(float(sample["value"][1])))

        self._formatted_nodes_gpu_info = gpu_node_list

        return gpu_node_list

    def get_available_gpus(
        self,
        model_name: str,
        ollama_models: ListResponse,
        use_multiple_gpus: bool = False
    ) -> GPUNodeList:
        """Get the available GPUs based on the free memory and the estimated VRAM.

        Reads the data produced by the last `get_formatted_nodes_gpu_info()`
        call. With `use_multiple_gpus`, a node with more than one GPU is judged
        by the summed free memory of all its GPUs and is returned with all of
        them; otherwise each GPU is judged on its own and only the GPUs that
        fit are returned.

        Args:
            model_name (`str`): Model name for LLM inference
            ollama_models (`ListResponse`): Ollama Models
            use_multiple_gpus (`bool`): Use multiple GPUs or not

        Returns:
            available_gpus (`GPUNodeList`): Available GPUs

        Raises:
            ValueError: If `model_name` is not listed in `ollama_models` or its
                details cannot be parsed
        """

        available_gpus: GPUNodeList = GPUNodeList([])

        # Estimate the VRAM required for LLM inference
        estimate_vram = self._calc_inference_estimate_vram(
            model_name, ollama_models
        )

        for gpu_node in self.formatted_nodes_gpu_info.gpu_nodes:
            print(f"Node: {gpu_node.node_name}, GPU Count: {len(gpu_node.gpus)}")

            if use_multiple_gpus and len(gpu_node.gpus) > 1:
                total_vram = sum(gpu.free_memory for gpu in gpu_node.gpus)

                self._filter_available_gpus(
                    total_vram, estimate_vram, available_gpus, gpu_node.node_name, gpu_node.gpus
                )
                continue

            for gpu in gpu_node.gpus:
                # Pass only the GPU being judged: passing the node's whole GPU
                # list would also select GPUs that do not fit and would add
                # duplicates when several GPUs of the node qualify.
                self._filter_available_gpus(
                    gpu.free_memory, estimate_vram, available_gpus, gpu_node.node_name, [gpu]
                )

        return available_gpus

    def _filter_available_gpus(
        self,
        free_memory: int,
        estimate_vram: int,
        available_gpus: GPUNodeList,
        node: str,
        gpus: List[GPU]
    ):
        """Add `gpus` to `available_gpus` when the free memory exceeds the estimate.

        Args:
            free_memory (`int`): Free memory to compare against the estimate.
            estimate_vram (`int`): Estimated VRAM required for the LLM inference.
            available_gpus (`GPUNodeList`): List to store the available GPUs (mutated in place).
            node (`str`): Node name.
            gpus (`List[GPU]`): GPUs to record under `node` when the check passes.
        """

        if free_memory <= estimate_vram:
            return

        existing_node = next(
            (gpu_node for gpu_node in available_gpus.gpu_nodes if gpu_node.node_name == node), None
        )

        if not existing_node:
            existing_node = GPUNode(node_name=node, gpus=[])
            available_gpus.gpu_nodes.append(existing_node)

        existing_node.gpus.extend(gpus)

    @staticmethod
    def _parse_parameter_size(value: str) -> float:
        """Parse a parameter-size string such as `"2.6B"` or `"137M"` into billions.

        Raises:
            ValueError: If `value` does not start with a number followed by K, M or B
        """

        match = re.match(r"(\d+(\.\d+)?)([KMB])", value)
        if not match:
            raise ValueError("Invalid format")

        # Honour the unit suffix so "137M" is 0.137 billion, not 137 billion.
        return float(match.group(1)) / GPUDispatcher._PARAMETER_UNIT_DIVISOR[match.group(3)]

    @staticmethod
    def _parse_quantization_level(value: str) -> int:
        """Extract the first number from a quantization level such as `"Q4_K_M"` or `"F16"`.

        Raises:
            ValueError: If `value` contains no digits
        """

        match = re.search(r'\d+', value)
        if not match:
            raise ValueError("No number found in the string")

        return int(match.group(0))

    def _calc_inference_estimate_vram(self, model_name: str, ollama_models: ListResponse) -> int:
        """Estimate the GPU memory needed for LLM inference.

        The estimate is derived from the model's parameter size and
        quantization level as reported by Ollama.

        Args:
            model_name (`str`): Name of the model to run with Ollama
            ollama_models (`ListResponse`): Ollama Models

        Returns:
            estimate_vram (`int`): Estimated GPU memory (MiB)

        Raises:
            ValueError: If the model is not listed, or its parameter size or
                quantization level cannot be parsed
        """

        for model in ollama_models.models:
            if model.model != model_name:
                continue

            parameter_size = self._parse_parameter_size(model.details.parameter_size)
            quantization_level = self._parse_quantization_level(
                model.details.quantization_level
            )

            # Formula reference: https://www.substratus.ai/blog/calculating-gpu-memory-for-llm
            # `result = ((parameter_size * 4 / (32 / quantization_level)) * 1.2) * iB`
            # `parameter_size` is the parameter count in billions
            # `quantization_level` is the number parsed from the quantization level
            # `iB` (from `const.format`) converts GiB to MiB
            # `1.2` adds 20% headroom to avoid running out of GPU memory
            estimate_vram = math.ceil(
                ((parameter_size * 4 / (32 / quantization_level)) * 1.2) * iB
            )

            print(
                f"Model: {model_name}, Estimate VRAM: {estimate_vram} MiB"
            )

            return estimate_vram

        # Returning None here would only surface later as a confusing
        # comparison error against the GPU free memory.
        raise ValueError(f"Model not found in Ollama model list: {model_name}")

In [35]:
# Build the dispatcher from the raw Prometheus results and group them per node.
gpu_dispatcher = GPUDispatcher(node_gpu_info)
nodes_gpu_info = gpu_dispatcher.get_formatted_nodes_gpu_info()
# NOTE(review): this prints the raw `node_gpu_info`, not the formatted
# `nodes_gpu_info` computed on the previous line — confirm that is intended.
print(node_gpu_info)

print("-" * 80)

# Ask the dispatcher which GPUs have enough free memory for the model.
model_name = "gemma2:2b"
available_gpus = gpu_dispatcher.get_available_gpus(
    model_name=model_name,
    ollama_models=models,
    use_multiple_gpus=True
)
print(available_gpus)

{'DCGM_FI_DEV_FB_FREE': {'status': 'success', 'data': {'resultType': 'vector', 'result': [{'metric': {'DCGM_FI_DRIVER_VERSION': '550.54.15', 'Hostname': 'ubuntu-d830mt', 'UUID': 'GPU-956ed35b-d92b-df08-0aa1-9a542e58350b', '__name__': 'DCGM_FI_DEV_FB_FREE', 'device': 'nvidia0', 'gpu': '0', 'instance': '10.244.0.80:9400', 'job': 'gpu-metrics', 'kubernetes_node': 'ubuntu-d830mt', 'modelName': 'NVIDIA GeForce RTX 3070 Ti', 'pci_bus_id': '00000000:01:00.0'}, 'value': [1737101573.721, '7955']}, {'metric': {'DCGM_FI_DRIVER_VERSION': '535.129.03', 'Hostname': 'ubuntu-ms-7d98', 'UUID': 'GPU-67b02881-4c95-3576-6996-9a3f6c30abdc', '__name__': 'DCGM_FI_DEV_FB_FREE', 'device': 'nvidia0', 'gpu': '0', 'instance': '10.244.1.14:9400', 'job': 'gpu-metrics', 'kubernetes_node': 'ubuntu-ms-7d98', 'modelName': 'NVIDIA GeForce RTX 4070', 'pci_bus_id': '00000000:01:00.0'}, 'value': [1737101573.721, '11996']}, {'metric': {'DCGM_FI_DRIVER_VERSION': '535.129.03', 'Hostname': 'ubuntu-ms-7d98', 'UUID': 'GPU-90416d

In [8]:
from gpu.dispatcher.types import GPUNodeList, GPUNode, GPU

# Scratch version of the per-node formatting: group every Prometheus sample by
# its node and GPU, then print the result.
gpu_node_list: GPUNodeList = GPUNodeList([])

for query, response in gpu_dispatcher.node_gpu_info.items():
    # The dispatcher method is `prometheus_metrics_name_mapping`; it does not
    # define a `metrics_name_mapping` attribute.
    field_name = gpu_dispatcher.prometheus_metrics_name_mapping(query)

    for node in response["data"]["result"]:
        node_name = node["metric"]["kubernetes_node"]
        value = node["value"][1]

        gpu_node = next(
            (known for known in gpu_node_list.gpu_nodes if known.node_name == node_name), None)
        if gpu_node is None:
            gpu_node = GPUNode(node_name, [])
            gpu_node_list.gpu_nodes.append(gpu_node)

        gpu_index = node["metric"]["gpu"]
        gpu_uuid = node["metric"]["UUID"]
        gpu_name = node["metric"]["modelName"]

        gpu_info = next(
            (
                gpu for gpu in gpu_node.gpus if gpu.index == f"cuda:{gpu_index}"
            ), None
        )

        if not gpu_info:
            gpu_info = GPU(
                index=f"cuda:{gpu_index}",
                uuid=gpu_uuid,
                name=gpu_name,
                free_memory=None,
                used_memory=None,
                temperature=None,
                memory_usage=None,
                power_usage=None
            )

            gpu_node.gpus.append(gpu_info)

        # `gpu_info` is already the matching GPU, so no second scan of the
        # node's GPU list is needed. Sample values arrive as strings.
        setattr(gpu_info, field_name, int(float(value)))

print(gpu_node_list)

GPUNodeList(gpu_nodes=[GPUNode(node_name='ubuntu-d830mt', gpu_infos=[GPUInfo(index='cuda:0', uuid='GPU-956ed35b-d92b-df08-0aa1-9a542e58350b', name='NVIDIA GeForce RTX 3070 Ti', free_memory=7955, used_memory=11, memory_usage=0, temperature=48, power_usage=17)]), GPUNode(node_name='ubuntu-ms-7d98', gpu_infos=[GPUInfo(index='cuda:0', uuid='GPU-67b02881-4c95-3576-6996-9a3f6c30abdc', name='NVIDIA GeForce RTX 4070', free_memory=11996, used_memory=12, memory_usage=0, temperature=40, power_usage=5), GPUInfo(index='cuda:1', uuid='GPU-90416dcd-a5d6-8f79-48d9-1ead45f5c327', name='NVIDIA GeForce RTX 4070', free_memory=11996, used_memory=12, memory_usage=0, temperature=39, power_usage=3)])])


In [9]:
from ollama import AsyncClient, ListResponse


class OllamaClient:
    """Thin asynchronous wrapper around `ollama.AsyncClient`."""

    _aclient: AsyncClient = None

    def __init__(self, host: str):
        """Create the underlying async client.

        Args:
            host (`str`): Host passed to `AsyncClient`
        """

        self._aclient = AsyncClient(host)

    async def list(self) -> ListResponse:
        """List the Ollama models.

        Returns:
            models (`ListResponse`): Result of `AsyncClient.list()`
        """

        models = await self._aclient.list()

        return models

{
    "models": [
        {
            "model": "gemma2:2b",
            "modified_at": "2025-01-02T15:39:30.242118Z",
            "digest": "8ccf136fdd5298f3ffe2d69862750ea7fb56555fa4d5b18c04e3fa4d82ee09d7",
            "size": 1629518495,
            "details": {
                "parent_model": "",
                "format": "gguf",
                "family": "gemma2",
                "families": [
                    "gemma2"
                ],
                "parameter_size": "2.6B",
                "quantization_level": "Q4_0"
            }
        },
        {
            "model": "gemma2-2b-builtin:latest",
            "modified_at": "2025-01-02T15:39:30.278117Z",
            "digest": "8ccf136fdd5298f3ffe2d69862750ea7fb56555fa4d5b18c04e3fa4d82ee09d7",
            "size": 1629518495,
            "details": {
                "parent_model": "",
                "format": "gguf",
                "family": "gemma2",
                "families": [
                    "gemma2"
        

In [11]:
import math
import re


def _calc_inference_estimate_vram(model_name: str, ollama_models: ListResponse) -> int:
    '''
    根據模型的 `參數量` 與 `量化等級` 計算進行 LLM 推理所需的預估 GPU 記憶體

    Args:
        model_name (`str`): 要使用 Ollama 進行 LLM Inference 的模型名稱
        ollama_models (`ListResponse`): Ollama Models

    Returns:
        estimate_vram (`int`): 預估 GPU 記憶體 (MiB)
    '''

    def re_parameter_size(value: str) -> float:
        '''
        對 Parameter Size 進行正規化

        Args:
            value (`str`): Parameter Size 字串，例如：`"1.2B"`、`"8.0B"`

        Returns:
            number (`float`): 正規化後的數值
        '''

        match = re.match(r"(\d+(\.\d+)?)([KMB])", value)
        if not match:
            raise ValueError("Invalid format")

        number = float(match.group(1))

        return number

    def re_quantization_level(value: str) -> int:
        '''
        對 Quantization Level 進行正規化

        Args:
            value (`str`): Quantization Level 字串，例如：`"Q8_0"`、`"Q4_K_M"`、`"Q4_0"`、`"F16"`

        Returns:
            number (`int`): 正規化後的數值
        '''

        match = re.search(r'\d+', value)
        if not match:
            raise ValueError("No number found in the string")

        return int(match.group(0))

    for model in ollama_models.models:
        if model.model == model_name:
            model_parameter_size = model.details.parameter_size
            model_quantization_level = model.details.quantization_level

            parameter_size = re_parameter_size(model_parameter_size)
            quantization_level = re_quantization_level(
                model_quantization_level)

            # 計算公式參考：https://www.substratus.ai/blog/calculating-gpu-memory-for-llm
            # `result = ((parameter_size * 4 / (32 / quantization_level)) * 1.2) * iB`
            # `result` 為估計的 VRAM 使用量 (MiB)
            # `parameter_size` 為模型參數量 (B)
            # `quantization_level` 為模型量化等級
            # `iB` 為 1024，用來將 GiB 轉換成 MiB
            # `1.2` 多計算 20% 的 GPU 記憶體，避免記憶體不足

            estimate_vram = math.ceil(
                ((parameter_size * 4 / (32 / quantization_level)) * 1.2) * 1024
            )

            print(
                f"Model: {model_name}, Estimate VRAM: {estimate_vram} MiB"
            )

            return estimate_vram

In [12]:
# Estimate the VRAM (MiB) needed for `gemma2:2b` from the Ollama model listing.
estimated_vram = _calc_inference_estimate_vram("gemma2:2b", models)
print(estimated_vram)

Model: gemma2:2b, Estimate VRAM: 1598 MiB
1598


In [3]:
import re
from typing import List

def convert_ollama_model_names(model_names: List[str]) -> List[str]:
    """Convert dash-separated model names to Ollama's `model:tag` form.

    For example `"gemma2-2b"` becomes `"gemma2:2b"`. A name is returned
    unchanged when it already contains a colon, has no size part, or is not
    fully described by the naming pattern.

    Args:
        model_names (List[str]): Model names to convert

    Returns:
        List[str]: Converted model names, in the same order as the input

    Raises:
        ValueError: If the input is not a list
    """
    # Input validation
    if not isinstance(model_names, list):
        raise ValueError("輸入必須是模型名稱列表")

    # <base>[-|.<size part>][:|.<second part>]; compiled once for all names
    pattern = re.compile(
        r'^([\w\d.]+)(?:[-.](\d+[a-z]?(?:-\w+)*))?(?:[:.](\d+[a-z]?(?:-\w+)*))?'
    )

    def convert_single_model_name(name: str) -> str:
        # Already in `model:tag` form (e.g. "gemma2-2b-builtin:latest").
        if ":" in name:
            return name

        match = pattern.match(name)
        # The pattern is not anchored at the end, so a partial match would
        # silently drop the rest of the name; leave such names untouched.
        if match is None or match.end() != len(name):
            return name

        model_base, size_part, second_part = match.groups()
        if size_part and not second_part:
            # Original form: model-version
            return f"{model_base}:{size_part}"

        # No version part, or a form that must not be rewritten
        return name

    return [convert_single_model_name(name) for name in model_names]

# Example usage and smoke test
def main():
    """Convert a fixed set of sample model names and print each result."""
    sample_names = [
        "gemma2-2b",
        "gemma2-9b",
        "llama3.1-8b",
        "llama3.2-3b",
        "llama3.3-70b-instruct-fp16",
        "llama3.2:1b",  # already in colon form
    ]

    try:
        converted_names = convert_ollama_model_names(sample_names)
        print("轉換後的模型名稱：")
        for before, after in zip(sample_names, converted_names):
            print(f"{before} → {after}")
    except ValueError as err:
        print(f"轉換錯誤：{err}")


if __name__ == "__main__":
    main()

轉換後的模型名稱：
gemma2-2b → gemma2:2b
gemma2-9b → gemma2:9b
llama3.1-8b → llama3.1:8b
llama3.2-3b → llama3.2:3b
llama3.3-70b-instruct-fp16 → llama3.3:70b-instruct-fp16
llama3.2:1b → llama3.2:1b


In [4]:
from llm.models import  OllamaBuiltinModel

def get_ollama_model(model_name: str) -> OllamaBuiltinModel:
    """Look up the `OllamaBuiltinModel` for a model name.

    Args:
        model_name (`str`): Value passed to `OllamaBuiltinModel`

    Returns:
        The matching `OllamaBuiltinModel`, or `None` (after printing a
        message) when the constructor raises `ValueError`.
    """
    try:
        builtin_model = OllamaBuiltinModel(model_name)
    except ValueError as err:
        print(f"無法獲取模型：{err}")
        return None

    return builtin_model
        
# Look up the built-in model entry for `gemma2:2b` and print it.
model = get_ollama_model("gemma2:2b")
print(model)

OllamaBuiltinModel.Gemma2_2B


In [5]:
import json
from typing import Any, Dict

import yaml

def parse_model_yaml(model_yaml_file_path: str) -> Dict[str, Any]:
    """Load a model manifest from a YAML file using the safe loader.

    Args:
        model_yaml_file_path (`str`): Path of the YAML file to read

    Returns:
        model_yaml (`Dict[str, Any]`): Parsed content of the file
    """
    with open(model_yaml_file_path, 'r') as manifest_file:
        return yaml.safe_load(manifest_file)
    
# Parse each built-in model manifest and print it as indented JSON.
gemma2_2b_yaml = parse_model_yaml("k8s/deploy/kubeai/gemma2-2b-builtin.yaml")
print(json.dumps(gemma2_2b_yaml, indent=4))

gemma2_9b_yaml = parse_model_yaml("k8s/deploy/kubeai/gemma2-9b-builtin.yaml")
print(json.dumps(gemma2_9b_yaml, indent=4))

llama3_1_8b_yaml = parse_model_yaml("k8s/deploy/kubeai/llama3.1-8b-builtin.yaml")
print(json.dumps(llama3_1_8b_yaml, indent=4))

llama3_2_3b_yaml = parse_model_yaml("k8s/deploy/kubeai/llama3.2-3b-builtin.yaml")
print(json.dumps(llama3_2_3b_yaml, indent=4))

{
    "apiVersion": "kubeai.org/v1",
    "kind": "Model",
    "metadata": {
        "name": "gemma2-2b"
    },
    "spec": {
        "features": [
            "TextGeneration"
        ],
        "image": "leoho0722/ollama-builtin-gemma2-2b:0.1.0",
        "url": "ollama://gemma2:2b",
        "engine": "OLlama",
        "replicas": 1,
        "minReplicas": 1,
        "targetRequests": 50,
        "scaleDownDelaySeconds": 30,
        "env": {
            "OLLAMA_KEEP_ALIVE": "0s"
        },
        "resourceProfile": "cpu:2"
    }
}
{
    "apiVersion": "kubeai.org/v1",
    "kind": "Model",
    "metadata": {
        "name": "gemma2-9b"
    },
    "spec": {
        "features": [
            "TextGeneration"
        ],
        "image": "leoho0722/ollama-builtin-gemma2-9b:0.1.0",
        "url": "ollama://gemma2:9b",
        "engine": "OLlama",
        "replicas": 1,
        "minReplicas": 1,
        "targetRequests": 50,
        "scaleDownDelaySeconds": 30,
        "env": {
            "OLL

In [None]:
async def get_available_gpus(
    self,
    model_name: str,
    ollama_models: ListResponse,
    use_multiple_gpus: bool = True  # defaults to True to support multi-GPU nodes
) -> GPUNodeList:
    """Select, per node, the GPUs needed to fit the model's estimated VRAM.

    Draft of a `GPUDispatcher` method: it reads `self.formatted_nodes_gpu_info`
    and calls `self._calc_inference_estimate_vram`.

    With `use_multiple_gpus`, GPUs of a node are accumulated from the smallest
    free memory upwards until their combined free memory reaches the estimate.
    Without it, only a single GPU that fits on its own is selected (the
    smallest such GPU).

    Args:
        model_name (`str`): Model name for LLM inference
        ollama_models (`ListResponse`): Ollama Models
        use_multiple_gpus (`bool`): Whether GPUs of one node may be combined

    Returns:
        available_gpus (`GPUNodeList`): One entry per node that can host the
            model, holding only the selected GPUs
    """
    available_gpus: GPUNodeList = GPUNodeList([])
    estimate_vram = self._calc_inference_estimate_vram(model_name, ollama_models)

    for gpu_node in self.formatted_nodes_gpu_info.gpu_nodes:
        # Sort the GPUs by free memory, smallest first
        sorted_gpus = sorted(gpu_node.gpus, key=lambda gpu: gpu.free_memory)
        total_node_vram = sum(gpu.free_memory for gpu in gpu_node.gpus)

        print(f"Node: {gpu_node.node_name}, Total VRAM: {total_node_vram}, Required VRAM: {estimate_vram}")

        selected_gpus = []

        if use_multiple_gpus:
            # Only try to combine GPUs when the node as a whole is big enough
            if total_node_vram >= estimate_vram:
                current_vram = 0

                # Start from the GPUs with the least free memory
                for gpu in sorted_gpus:
                    selected_gpus.append(gpu)
                    current_vram += gpu.free_memory

                    if current_vram >= estimate_vram:
                        break
        else:
            # A single GPU must fit the model on its own
            single_gpu = next(
                (gpu for gpu in sorted_gpus if gpu.free_memory >= estimate_vram), None
            )
            if single_gpu is not None:
                selected_gpus.append(single_gpu)

        if selected_gpus:
            available_gpus.gpu_nodes.append(
                GPUNode(node_name=gpu_node.node_name, gpus=selected_gpus)
            )

        # Report the GPUs chosen on this node, not the number of nodes so far
        print(
            f"Node {gpu_node.node_name}: Selected {len(selected_gpus)} GPU(s)"
        )

    return available_gpus

def _calc_inference_estimate_vram(self, model_name: str, ollama_models: ListResponse) -> int:
    """Estimate the GPU memory needed for LLM inference, with a 20% buffer.

    Draft of a method: it logs through `self.logger`.

    Args:
        model_name (`str`): Name of the model to run with Ollama
        ollama_models (`ListResponse`): Ollama Models

    Returns:
        estimate_vram (`int`): Estimated GPU memory (MiB)

    Raises:
        ValueError: If the model is not listed, or its parameter size or
            quantization level cannot be parsed
    """
    # Divisor that converts a parameter count with the given suffix to billions
    unit_divisor = {"K": 1_000_000, "M": 1_000, "B": 1}

    def re_parameter_size(value: str) -> float:
        """Parse a parameter-size string into billions of parameters.

        Args:
            value (`str`): Parameter size string, e.g. `"1.2B"`, `"8.0B"`, `"137M"`

        Returns:
            number (`float`): Parameter count in billions
        """

        match = re.match(r"(\d+(\.\d+)?)([KMB])", value)
        if not match:
            raise ValueError("Invalid format")

        # Honour the unit suffix so "137M" is 0.137 billion, not 137 billion.
        return float(match.group(1)) / unit_divisor[match.group(3)]

    def re_quantization_level(value: str) -> int:
        """Extract the first number from a quantization level string.

        Args:
            value (`str`): Quantization level string, e.g. `"Q8_0"`, `"Q4_K_M"`, `"Q4_0"`, `"F16"`

        Returns:
            number (`int`): First number found in the string
        """

        match = re.search(r'\d+', value)
        if not match:
            raise ValueError("No number found in the string")

        return int(match.group(0))

    for model in ollama_models.models:
        if model.model != model_name:
            continue

        parameter_size = re_parameter_size(model.details.parameter_size)
        quantization_level = re_quantization_level(model.details.quantization_level)

        # Base VRAM requirement
        base_vram = math.ceil(
            ((parameter_size * 4 / (32 / quantization_level))) * iB
        )

        # Add a 20% buffer
        estimate_vram = math.ceil(base_vram * 1.2)

        self.logger.info(
            f"Model: {model_name}, Base VRAM: {base_vram} MiB, Final Estimate: {estimate_vram} MiB"
        )

        return estimate_vram

    # Returning None here would only surface later as a confusing comparison
    # error against the GPU free memory.
    raise ValueError(f"Model not found in Ollama model list: {model_name}")

In [1]:
def convert_to_kubeai_gpu_resources_name(model: str, count: int) -> str:
    """Build the KubeAI GPU resources name for a GPU model and GPU count.

    The first known model number contained in `model` decides the name; the
    result and any error are also printed.

    Args:
        model (`str`): GPU model name, e.g. `"NVIDIA GeForce RTX 4070"`
        count (`int`): Number of GPUs

    Returns:
        kubeai_gpu_resources_name (`str`): KubeAI GPU resources name,
            e.g. `"nvidia-gpu-4070-12gb:2"`

    Raises:
        ValueError: If `model` contains none of the known model numbers
    """
    # Known GPU model numbers and their VRAM size in GB
    vram_gb_by_model = {
        "3070 Ti": "8",
        "3080 Ti": "12",
        "3090": "24",
        "4070": "12",
        "4080": "16",
        "4090": "24",
    }

    try:
        # First table entry whose model number appears in the model name
        matched = next(
            ((number, vram) for number, vram in vram_gb_by_model.items() if number in model),
            None,
        )
        if matched is None:
            raise ValueError(f"Unsupported GPU model: {model}")

        number, vram = matched
        slug = number.replace(" ", "").lower()

        # Format the output
        resources_name = f"nvidia-gpu-{slug}-{vram}gb:{count}"

        print(
            f"Selected GPU Model: {model}, "
            f"Count: {count}, "
            f"Resource Name: {resources_name}"
        )

        return resources_name

    except Exception as error:
        print(f"Error converting GPU resources name: {str(error)}")
        raise
    
# Sample GPU models and counts used to exercise the conversion
testing_inputs = [
    {"model": "NVIDIA GeForce RTX 3070 Ti", "count": 1},
    {"model": "NVIDIA GeForce RTX 3080 Ti", "count": 2},
    {"model": "NVIDIA GeForce RTX 4070", "count": 2},
    {"model": "NVIDIA GeForce RTX 4090", "count": 1},
]

# The dict keys match the function's parameter names
for input_data in testing_inputs:
    convert_to_kubeai_gpu_resources_name(**input_data)

Selected GPU Model: NVIDIA GeForce RTX 3070 Ti, Count: 1, Resource Name: nvidia-gpu-3070ti-8gb:1
Selected GPU Model: NVIDIA GeForce RTX 3080 Ti, Count: 2, Resource Name: nvidia-gpu-3080ti-12gb:2
Selected GPU Model: NVIDIA GeForce RTX 4070, Count: 2, Resource Name: nvidia-gpu-4070-12gb:2
Selected GPU Model: NVIDIA GeForce RTX 4090, Count: 1, Resource Name: nvidia-gpu-4090-24gb:1


In [10]:
import json
import yaml

# Load the GPU model table. Only the parse needs the file handle, and the
# `with` block closes it on exit, so no explicit `close()` is required.
with open("backend/gpu/dispatcher/gpu_models.yaml", encoding="utf-8") as f:
    gpu_models: List[Dict[str, Any]] = yaml.safe_load(f)

for gpu in gpu_models:
    model = gpu["model"]
    vram = gpu["vram"]
    print(f"Model: {model}, VRAM: {vram}")

    # Keep only the part after "RTX", e.g. "3070 Ti"
    model = model.split("RTX")[1].strip()
    print(f"Model Number: {model}")
    

Model: NVIDIA Geforce RTX 3070 Ti, VRAM: 8
Model Number: 3070 Ti
Model: NVIDIA Geforce RTX 3080 Ti, VRAM: 12
Model Number: 3080 Ti
Model: NVIDIA Geforce RTX 4070, VRAM: 12
Model Number: 4070
Model: NVIDIA Geforce RTX 4080 SUPER, VRAM: 16
Model Number: 4080 SUPER
Model: NVIDIA Geforce RTX 4090, VRAM: 24
Model Number: 4090
