diff --git a/Dockerfile-cuda-blackwell b/Dockerfile-cuda-blackwell
new file mode 100644
index 00000000..8614df59
--- /dev/null
+++ b/Dockerfile-cuda-blackwell
@@ -0,0 +1,145 @@
+FROM nvidia/cuda:12.9.0-devel-ubuntu22.04 AS base-builder
+
+ENV SCCACHE=0.10.0
+ENV RUSTC_WRAPPER=/usr/local/bin/sccache
+ENV PATH="/root/.cargo/bin:${PATH}"
+# aligned with `cargo-chef` version in `lukemathwalker/cargo-chef:latest-rust-1.85-bookworm`
+ENV CARGO_CHEF=0.1.71
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    curl \
+    libssl-dev \
+    pkg-config \
+    && rm -rf /var/lib/apt/lists/*
+
+# Download and configure sccache
+RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \
+    chmod +x /usr/local/bin/sccache
+
+RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
+RUN cargo install cargo-chef --version $CARGO_CHEF --locked
+
+FROM base-builder AS planner
+
+WORKDIR /usr/src
+
+COPY backends backends
+COPY core core
+COPY router router
+COPY Cargo.toml ./
+COPY Cargo.lock ./
+
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM base-builder AS builder
+
+ARG CUDA_COMPUTE_CAP=80
+ARG GIT_SHA
+ARG DOCKER_LABEL
+
+# Limit parallelism
+ARG RAYON_NUM_THREADS
+ARG CARGO_BUILD_JOBS
+ARG CARGO_BUILD_INCREMENTAL
+
+# sccache specific variables
+ARG SCCACHE_GHA_ENABLED
+
+WORKDIR /usr/src
+
+RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    if [ ${CUDA_COMPUTE_CAP} -ge 75 -a ${CUDA_COMPUTE_CAP} -lt 80 ]; \
+    then \
+        nvprune --generate-code code=sm_${CUDA_COMPUTE_CAP} /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \
+    elif [ ${CUDA_COMPUTE_CAP} -ge 80 -a ${CUDA_COMPUTE_CAP} -lt 90 ]; \
+    then \
+        nvprune --generate-code code=sm_80 --generate-code code=sm_${CUDA_COMPUTE_CAP} /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \
+    elif [ ${CUDA_COMPUTE_CAP} -eq 90 ]; \
+    then \
+        nvprune --generate-code code=sm_90 /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \
+    elif [ ${CUDA_COMPUTE_CAP} -eq 120 ]; \
+    then \
+        nvprune --generate-code code=sm_120 /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \
+    else \
+        echo "cuda compute cap ${CUDA_COMPUTE_CAP} is not supported"; exit 1; \
+    fi;
+
+COPY --from=planner /usr/src/recipe.json recipe.json
+
+RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    if [ ${CUDA_COMPUTE_CAP} -ge 75 -a ${CUDA_COMPUTE_CAP} -lt 80 ]; \
+    then \
+        cargo chef cook --release --features candle-cuda-turing --features static-linking --no-default-features --recipe-path recipe.json && sccache -s; \
+    else \
+        cargo chef cook --release --features candle-cuda --features static-linking --no-default-features --recipe-path recipe.json && sccache -s; \
+    fi;
+
+COPY backends backends
+COPY core core
+COPY router router
+COPY Cargo.toml ./
+COPY Cargo.lock ./
+
+FROM builder AS http-builder
+
+RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    if [ ${CUDA_COMPUTE_CAP} -ge 75 -a ${CUDA_COMPUTE_CAP} -lt 80 ]; \
+    then \
+        cargo build --release --bin text-embeddings-router -F candle-cuda-turing -F static-linking -F http --no-default-features && sccache -s; \
+    else \
+        cargo build --release --bin text-embeddings-router -F candle-cuda -F static-linking -F http --no-default-features && sccache -s; \
+    fi;
+
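+# The grpc-builder stage below mirrors http-builder, swapping the `http` feature
+# for `grpc` and installing protoc so the gRPC stubs can be generated from `proto/`.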
+FROM builder AS grpc-builder
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    unzip \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
+    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
+    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
+    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
+    rm -f $PROTOC_ZIP
+
+COPY proto proto
+
+RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    if [ ${CUDA_COMPUTE_CAP} -ge 75 -a ${CUDA_COMPUTE_CAP} -lt 80 ]; \
+    then \
+        cargo build --release --bin text-embeddings-router -F candle-cuda-turing -F static-linking -F grpc --no-default-features && sccache -s; \
+    else \
+        cargo build --release --bin text-embeddings-router -F candle-cuda -F static-linking -F grpc --no-default-features && sccache -s; \
+    fi;
+
+FROM nvidia/cuda:12.9.0-base-ubuntu22.04 AS base
+
+ARG DEFAULT_USE_FLASH_ATTENTION=True
+
+ENV HUGGINGFACE_HUB_CACHE=/data \
+    PORT=80 \
+    USE_FLASH_ATTENTION=$DEFAULT_USE_FLASH_ATTENTION
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    ca-certificates \
+    libssl-dev \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+FROM base AS grpc
+
+COPY --from=grpc-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router
+
+ENTRYPOINT ["text-embeddings-router"]
+CMD ["--json-output"]
+
+FROM base
+
+COPY --from=http-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router
+
+ENTRYPOINT ["text-embeddings-router"]
+CMD ["--json-output"]
diff --git a/README.md b/README.md
index e9e98d0b..13a0d2dc 100644
--- a/README.md
+++ b/README.md
@@ -557,6 +557,8 @@ You can build the CPU container with:
 docker build .
 ```
 
+### CUDA - Pre-Blackwell architectures
+
 To build the CUDA containers, you need to know the compute cap of the GPU you will be using
 at runtime.
 
@@ -584,6 +586,39 @@ runtime_compute_cap=90
 docker build . -f Dockerfile-cuda --build-arg CUDA_COMPUTE_CAP=$runtime_compute_cap
 ```
 
+### CUDA - Blackwell architecture
+
+To build the CUDA containers for the Blackwell architecture, CUDA 12.9 is required, so you need to use a different
+Dockerfile and set the compute cap to 120. This Dockerfile can also be used to build for the previous architectures.
+
+Commands to build the container:
+
+```shell
+# Get submodule dependencies
+git submodule update --init
+
+# Example for Turing (T4, RTX 2000 series, ...)
+runtime_compute_cap=75
+
+# Example for A100
+runtime_compute_cap=80
+
+# Example for A10
+runtime_compute_cap=86
+
+# Example for Ada Lovelace (RTX 4000 series, ...)
+runtime_compute_cap=89
+
+# Example for H100
+runtime_compute_cap=90
+
+# Example for Blackwell (RTX 5000 series, ...)
+runtime_compute_cap=120
+
+docker build . -f Dockerfile-cuda-blackwell --build-arg CUDA_COMPUTE_CAP=$runtime_compute_cap
+```
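+
+If you are unsure which compute cap your GPU has, you can query it directly with `nvidia-smi` (a convenience
+check, assuming a driver recent enough to expose the `compute_cap` field); drop the decimal point to get the
+value used above, e.g. `7.5` becomes 75 and `12.0` becomes 120:
+
+```shell
+nvidia-smi --query-gpu=compute_cap --format=csv,noheader
+```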
+
 ### Apple M1/M2 arm64 architectures
 
 #### DISCLAIMER
diff --git a/backends/candle/src/compute_cap.rs b/backends/candle/src/compute_cap.rs
index ac79fcf1..56978f5d 100644
--- a/backends/candle/src/compute_cap.rs
+++ b/backends/candle/src/compute_cap.rs
@@ -30,6 +30,7 @@ fn compute_cap_matching(runtime_compute_cap: usize, compile_compute_cap: usize)
         (86..=89, 80..=86) => true,
         (89, 89) => true,
         (90, 90) => true,
+        (120, 120) => true,
         (_, _) => false,
     }
 }
@@ -54,6 +55,7 @@ mod tests {
         assert!(compute_cap_matching(86, 86));
         assert!(compute_cap_matching(89, 89));
         assert!(compute_cap_matching(90, 90));
+        assert!(compute_cap_matching(120, 120));
 
         assert!(compute_cap_matching(86, 80));
         assert!(compute_cap_matching(89, 80));
diff --git a/backends/candle/src/flash_attn.rs b/backends/candle/src/flash_attn.rs
index 8dbe58cf..f1b69c72 100644
--- a/backends/candle/src/flash_attn.rs
+++ b/backends/candle/src/flash_attn.rs
@@ -61,7 +61,7 @@ pub(crate) fn flash_attn_varlen(
         }
         #[cfg(not(feature = "flash-attn-v1"))]
         candle::bail!("Flash attention v1 is not installed. Use `flash-attn-v1` feature.")
-    } else if (80..90).contains(&runtime_compute_cap) || runtime_compute_cap == 90 {
+    } else if (80..90).contains(&runtime_compute_cap) || runtime_compute_cap == 90 || runtime_compute_cap == 120 {
         #[cfg(feature = "flash-attn")]
         {
             use candle_flash_attn::{flash_attn_varlen_alibi_windowed, flash_attn_varlen_windowed};
diff --git a/docs/source/en/custom_container.md b/docs/source/en/custom_container.md
index c670026c..bbdbae27 100644
--- a/docs/source/en/custom_container.md
+++ b/docs/source/en/custom_container.md
@@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
 
 # Build a custom container for TEI
 
-You can build our own CPU or CUDA TEI container using Docker. To build a CPU container, run the following command in the
+You can build your own CPU or CUDA TEI container using Docker. To build a CPU container, run the following command in the
 directory containing your custom Dockerfile:
 
 ```shell
@@ -32,9 +32,11 @@ the examples of runtime compute capabilities for various GPU types:
 - A10 - `runtime_compute_cap=86`
 - Ada Lovelace (RTX 4000 series, ...) - `runtime_compute_cap=89`
 - H100 - `runtime_compute_cap=90`
+- Blackwell (RTX 5000 series, ...) - `runtime_compute_cap=120`
 
-Once you have determined the compute capability is determined, set it as the `runtime_compute_cap` variable and build
-the container as shown in the example below:
+Once you have determined the compute capability, set it as the `runtime_compute_cap` variable and build
+the container using `Dockerfile-cuda` if the runtime compute cap is lower than 120; otherwise, use
+`Dockerfile-cuda-blackwell`, as shown in the example below:
 
 ```shell
 # Get submodule dependencies