2 changes: 1 addition & 1 deletion .github/workflows/build.yaml
@@ -45,7 +45,7 @@ jobs:
export dockerfile="Dockerfile"
export label_extension=""
export docker_devices=""
export runs_on="aws-g6-12xlarge-plus-priv"
export runs_on="aws-g6-12xl-plus-priv-cache"
export platform=""
;;
rocm)
6 changes: 6 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Dockerfile
@@ -258,7 +258,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \
make gen-server && \
pip install -r requirements_cuda.txt && \
pip install ".[bnb, accelerate, marlin, quantize, peft, outlines]" --no-cache-dir && \
pip install ".[bnb, accelerate, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
pip install nvidia-nccl-cu12==2.22.3

ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
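
The new `moe` extra pulls the fused MoE kernels into the image at build time. A quick, illustrative sanity check (not part of this diff, and assuming the wheel's import name is `moe_kernels`) that the extra actually landed in the built image:

# Illustrative check, not from this PR: confirm the fused MoE kernels are importable.
import importlib.util

if importlib.util.find_spec("moe_kernels") is None:
    raise RuntimeError("moe-kernels is not installed; was the `moe` extra included?")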
JSON test snapshot (file name not shown)
@@ -16,17 +16,17 @@
},
{
"id": 28804,
"logprob": -7.4335938,
"logprob": -7.4375,
"text": "?"
},
{
"id": 13,
"logprob": -0.8017578,
"logprob": -0.8046875,
"text": "\n"
},
{
"id": 13,
"logprob": -0.32958984,
"logprob": -0.33032227,
"text": "\n"
}
],
@@ -64,7 +64,7 @@
},
{
"id": 369,
"logprob": -0.06585693,
"logprob": 0.0,
Contributor:

Huh ?

@danieldk (Member, Author), Sep 17, 2024:

Also see the " me" above, and that was before the changes.

@danieldk (Member, Author), Sep 17, 2024:

Guess that's what a temperature of 0.5 does. Happens a lot though:

❯ git grep '"logprob": 0\.0,' | uniq | wc -l
33
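
For intuition, here is a minimal sketch (not part of the PR) of why a temperature below 1.0 can turn the selected token's reported log-probability into an exact 0.0: dividing the logits by T = 0.5 sharpens the softmax until the competing terms fall below float32 resolution inside the log-sum-exp. The logits below are made-up values.

import torch

# Made-up logits for a 4-token vocabulary; the first token is strongly preferred.
logits = torch.tensor([12.0, 3.0, 2.0, 1.0])

for temperature in (1.0, 0.5):
    logprobs = torch.log_softmax(logits / temperature, dim=-1)
    print(temperature, logprobs[0].item())

# At T=1.0 the top logprob is a small negative number (about -1.9e-4 here); at
# T=0.5 the competing exponentials are below float32 resolution in the sum, so
# the reported logprob comes out as exactly 0.0.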

"special": false,
"text": " that"
},
2 changes: 2 additions & 0 deletions nix/server.nix
@@ -21,6 +21,7 @@
loguru,
mamba-ssm,
marlin-kernels,
moe-kernels,
opentelemetry-api,
opentelemetry-exporter-otlp,
opentelemetry-instrumentation-grpc,
@@ -88,6 +89,7 @@ buildPythonPackage {
loguru
mamba-ssm
marlin-kernels
moe-kernels
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-instrumentation-grpc
88 changes: 88 additions & 0 deletions server/poetry.lock

Some generated files are not rendered by default.

7 changes: 7 additions & 0 deletions server/pyproject.toml
@@ -46,13 +46,20 @@ marlin-kernels = [
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
]
moe-kernels = [
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.2.2/moe_kernels-0.2.2+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.2.2/moe_kernels-0.2.2+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.2.2/moe_kernels-0.2.2+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.2.2/moe_kernels-0.2.2+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
]
rich = "^13.7.1"

[tool.poetry.extras]
torch = ["torch"]
accelerate = ["accelerate"]
bnb = ["bitsandbytes"]
marlin = ["marlin-kernels"]
moe = ["moe-kernels"]
peft = ["peft"]
quantize = ["texttable", "datasets", "accelerate"]
outlines = ["outlines"]
76 changes: 76 additions & 0 deletions server/text_generation_server/layers/moe/__init__.py
@@ -0,0 +1,76 @@
from typing import Optional

import torch
import torch.nn as nn
from text_generation_server.layers.fp8 import HybridFP8UnquantLoader
from text_generation_server.layers.moe.unquantized import UnquantizedSparseMoELayer
from text_generation_server.utils.weights import (
    DefaultWeightsLoader,
    UnquantizedWeight,
    Weights,
)


class SparseMoELayer(nn.Module):
    """
    Layer for MoE that uses fused kernels to only apply the active experts
    for each token (rather than applying all experts and selecting the
    outputs of active experts).
    """

    def __init__(
        self,
        *,
        n_expert_group: Optional[int],
        n_experts: int,
        prefix: str,
        renormalize: bool,
        topk: int,
        topk_group: Optional[int],
        weights: Weights,
        gate_proj_name: str = "gate_proj",
        up_proj_name: str = "up_proj",
        down_proj_name: str = "down_proj",
    ):
        super().__init__()

        if (
            isinstance(weights.loader, DefaultWeightsLoader)
            and isinstance(weights.loader.weight_class, UnquantizedWeight)
        ) or isinstance(weights.loader, HybridFP8UnquantLoader):
            cls = UnquantizedSparseMoELayer
        # Once we wire up GPTQ-Marlin MoE:
        # elif isinstance(weights.loader, GPTQMarlinWeightsLoader) and weights.loader.sym:
        #     cls = GPTQMarlinSparseMoELayer
        else:
            raise ValueError(
                f"Unsupported weights loader: {weights.loader}, sparse MoE is only supported for unquantized and GPTQ weights"
            )

        self.moe = cls(
            n_expert_group=n_expert_group,
            n_experts=n_experts,
            prefix=prefix,
            renormalize=renormalize,
            topk=topk,
            topk_group=topk_group,
            weights=weights,
            gate_proj_name=gate_proj_name,
            up_proj_name=up_proj_name,
            down_proj_name=down_proj_name,
        )

    def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor:
        return self.moe(x, gating_output=gating_output)

    @staticmethod
    def is_supported(weights: Weights) -> bool:
        return (
            (
                isinstance(weights.loader, DefaultWeightsLoader)
                and isinstance(weights.loader.weight_class, UnquantizedWeight)
            )
            or isinstance(weights.loader, HybridFP8UnquantLoader)
            # Once we wire up GPTQ-Marlin MoE:
            # or isinstance(weights.loader, GPTQMarlinWeightsLoader)
        )
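
For context, a minimal usage sketch (not part of this diff) of how a model implementation might consume the new layer; the surrounding module and the `config` attribute names (`num_local_experts`, `num_experts_per_tok`) are illustrative assumptions, not code from this PR.

import torch
import torch.nn as nn

from text_generation_server.layers.moe import SparseMoELayer
from text_generation_server.utils.weights import Weights


class IllustrativeMoEBlock(nn.Module):
    """Hypothetical MoE block that routes tokens through the fused sparse MoE layer."""

    def __init__(self, prefix: str, config, weights: Weights):
        super().__init__()
        if not SparseMoELayer.is_supported(weights):
            raise NotImplementedError("Weights loader not supported by the fused sparse MoE layer")
        self.moe = SparseMoELayer(
            n_expert_group=None,
            n_experts=config.num_local_experts,
            prefix=f"{prefix}.experts",
            renormalize=True,
            topk=config.num_experts_per_tok,
            topk_group=None,
            weights=weights,
        )

    def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor) -> torch.Tensor:
        # The router emits one logit per expert for every token; the fused layer
        # selects and applies only the top-k experts per token.
        return self.moe(hidden_states, gating_output=router_logits)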