2 changes: 1 addition & 1 deletion .github/workflows/build.yaml
@@ -45,7 +45,7 @@ jobs:
export dockerfile="Dockerfile"
export label_extension=""
export docker_devices=""
export runs_on="aws-g6-12xlarge-plus-priv"
export runs_on="aws-g6-12xl-plus-priv-cache"
export platform=""
;;
rocm)
6 changes: 6 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Dockerfile
@@ -258,7 +258,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \
make gen-server && \
pip install -r requirements_cuda.txt && \
pip install ".[bnb, accelerate, marlin, quantize, peft, outlines]" --no-cache-dir && \
pip install ".[bnb, accelerate, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
pip install nvidia-nccl-cu12==2.22.3

ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
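
The new `moe` extra pulls the fused MoE kernels into the image at build time. A quick, illustrative sanity check (not part of this diff, and assuming the wheel's import name is `moe_kernels`) that the extra actually landed in the built image:

# Illustrative check, not from this PR: confirm the fused MoE kernels are importable.
import importlib.util

if importlib.util.find_spec("moe_kernels") is None:
    raise RuntimeError("moe-kernels is not installed; was the `moe` extra included?")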
JSON test snapshot (file name not shown)
@@ -16,17 +16,17 @@
},
{
"id": 28804,
"logprob": -7.4335938,
"logprob": -7.4375,
"text": "?"
},
{
"id": 13,
"logprob": -0.8017578,
"logprob": -0.8046875,
"text": "\n"
},
{
"id": 13,
"logprob": -0.32958984,
"logprob": -0.33032227,
"text": "\n"
}
],
@@ -64,7 +64,7 @@
},
{
"id": 369,
"logprob": -0.06585693,
"logprob": 0.0,
Contributor:

Huh ?

@danieldk (Member, Author), Sep 17, 2024:

Also see the " me" above, and that was before the changes.

@danieldk (Member, Author), Sep 17, 2024:

Guess that's what a temperature of 0.5 does. Happens a lot though:

❯ git grep '"logprob": 0\.0,' | uniq | wc -l
33
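
For intuition, here is a minimal sketch (not part of the PR) of why a temperature below 1.0 can turn the selected token's reported log-probability into an exact 0.0: dividing the logits by T = 0.5 sharpens the softmax until the competing terms fall below float32 resolution inside the log-sum-exp. The logits below are made-up values.

import torch

# Made-up logits for a 4-token vocabulary; the first token is strongly preferred.
logits = torch.tensor([12.0, 3.0, 2.0, 1.0])

for temperature in (1.0, 0.5):
    logprobs = torch.log_softmax(logits / temperature, dim=-1)
    print(temperature, logprobs[0].item())

# At T=1.0 the top logprob is a small negative number (about -1.9e-4 here); at
# T=0.5 the competing exponentials are below float32 resolution in the sum, so
# the reported logprob comes out as exactly 0.0.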

"special": false,
"text": " that"
},
2 changes: 2 additions & 0 deletions nix/server.nix
@@ -21,6 +21,7 @@
loguru,
mamba-ssm,
marlin-kernels,
moe-kernels,
opentelemetry-api,
opentelemetry-exporter-otlp,
opentelemetry-instrumentation-grpc,
@@ -88,6 +89,7 @@ buildPythonPackage {
loguru
mamba-ssm
marlin-kernels
moe-kernels
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-instrumentation-grpc
88 changes: 88 additions & 0 deletions server/poetry.lock

Some generated files are not rendered by default.

7 changes: 7 additions & 0 deletions server/pyproject.toml
@@ -46,13 +46,20 @@ marlin-kernels = [
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
]
moe-kernels = [
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.2.2/moe_kernels-0.2.2+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.2.2/moe_kernels-0.2.2+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.2.2/moe_kernels-0.2.2+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.2.2/moe_kernels-0.2.2+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
]
rich = "^13.7.1"

[tool.poetry.extras]
torch = ["torch"]
accelerate = ["accelerate"]
bnb = ["bitsandbytes"]
marlin = ["marlin-kernels"]
moe = ["moe-kernels"]
peft = ["peft"]
quantize = ["texttable", "datasets", "accelerate"]
outlines = ["outlines"]
76 changes: 76 additions & 0 deletions server/text_generation_server/layers/moe/__init__.py
@@ -0,0 +1,76 @@
from typing import Optional

import torch
import torch.nn as nn
from text_generation_server.layers.fp8 import HybridFP8UnquantLoader
from text_generation_server.layers.moe.unquantized import UnquantizedSparseMoELayer
from text_generation_server.utils.weights import (
    DefaultWeightsLoader,
    UnquantizedWeight,
    Weights,
)


class SparseMoELayer(nn.Module):
    """
    Layer for MoE that uses fused kernels to only apply the active experts
    for each token (rather than applying all experts and selecting the
    outputs of active experts).
    """

    def __init__(
        self,
        *,
        n_expert_group: Optional[int],
        n_experts: int,
        prefix: str,
        renormalize: bool,
        topk: int,
        topk_group: Optional[int],
        weights: Weights,
        gate_proj_name: str = "gate_proj",
        up_proj_name: str = "up_proj",
        down_proj_name: str = "down_proj",
    ):
        super().__init__()

        if (
            isinstance(weights.loader, DefaultWeightsLoader)
            and isinstance(weights.loader.weight_class, UnquantizedWeight)
        ) or isinstance(weights.loader, HybridFP8UnquantLoader):
            cls = UnquantizedSparseMoELayer
        # Once we wire up GPTQ-Marlin MoE:
        # elif isinstance(weights.loader, GPTQMarlinWeightsLoader) and weights.loader.sym:
        #     cls = GPTQMarlinSparseMoELayer
        else:
            raise ValueError(
                f"Unsupported weights loader: {weights.loader}, sparse MoE is only supported for unquantized and GPTQ weights"
            )

        self.moe = cls(
            n_expert_group=n_expert_group,
            n_experts=n_experts,
            prefix=prefix,
            renormalize=renormalize,
            topk=topk,
            topk_group=topk_group,
            weights=weights,
            gate_proj_name=gate_proj_name,
            up_proj_name=up_proj_name,
            down_proj_name=down_proj_name,
        )

    def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor:
        return self.moe(x, gating_output=gating_output)

    @staticmethod
    def is_supported(weights: Weights) -> bool:
        return (
            (
                isinstance(weights.loader, DefaultWeightsLoader)
                and isinstance(weights.loader.weight_class, UnquantizedWeight)
            )
            or isinstance(weights.loader, HybridFP8UnquantLoader)
            # Once we wire up GPTQ-Marlin MoE:
            # or isinstance(weights.loader, GPTQMarlinWeightsLoader)
        )
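
For context, a minimal usage sketch (not part of this diff) of how a model implementation might consume the new layer; the surrounding module and the `config` attribute names (`num_local_experts`, `num_experts_per_tok`) are illustrative assumptions, not code from this PR.

import torch
import torch.nn as nn

from text_generation_server.layers.moe import SparseMoELayer
from text_generation_server.utils.weights import Weights


class IllustrativeMoEBlock(nn.Module):
    """Hypothetical MoE block that routes tokens through the fused sparse MoE layer."""

    def __init__(self, prefix: str, config, weights: Weights):
        super().__init__()
        if not SparseMoELayer.is_supported(weights):
            raise NotImplementedError("Weights loader not supported by the fused sparse MoE layer")
        self.moe = SparseMoELayer(
            n_expert_group=None,
            n_experts=config.num_local_experts,
            prefix=f"{prefix}.experts",
            renormalize=True,
            topk=config.num_experts_per_tok,
            topk_group=None,
            weights=weights,
        )

    def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor) -> torch.Tensor:
        # The router emits one logit per expert for every token; the fused layer
        # selects and applies only the top-k experts per token.
        return self.moe(hidden_states, gating_output=router_logits)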