diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile new file mode 100644 index 00000000000..8d020f16c97 --- /dev/null +++ b/.devops/cpu.Dockerfile @@ -0,0 +1,81 @@ +ARG UBUNTU_VERSION=22.04 + +FROM ubuntu:$UBUNTU_VERSION AS build + +RUN apt-get update && \ + apt-get install -y build-essential git cmake libcurl4-openssl-dev + +WORKDIR /app + +COPY . . + +RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build -j $(nproc) + +RUN mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib \; + +RUN mkdir -p /app/full \ + && cp build/bin/* /app/full \ + && cp *.py /app/full \ + && cp -r gguf-py /app/full \ + && cp -r requirements /app/full \ + && cp requirements.txt /app/full \ + && cp .devops/tools.sh /app/full/tools.sh + +## Base image +FROM ubuntu:$UBUNTU_VERSION AS base + +RUN apt-get update \ + && apt-get install -y libgomp1 curl\ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=build /app/lib/ /app + +### Full +FROM base AS full + +COPY --from=build /app/full /app + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y \ + git \ + python3 \ + python3-pip \ + && pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +ENTRYPOINT ["/app/tools.sh"] + +### Light, CLI only +FROM base AS light + +COPY --from=build /app/full/llama-cli /app + +WORKDIR /app + +ENTRYPOINT [ "/app/llama-cli" ] + +### Server, Server only +FROM base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +COPY --from=build /app/full/llama-server /app + +WORKDIR /app + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile new file mode 100644 index 00000000000..974dd78a8b0 --- /dev/null +++ b/.devops/cuda.Dockerfile @@ -0,0 +1,94 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG CUDA_VERSION=12.6.0 +# Target the CUDA build image +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} + +ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} AS build + +# CUDA architecture to build for (defaults to all supported archs) +ARG CUDA_DOCKER_ARCH=default + +RUN apt-get update && \ + apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1 + +WORKDIR /app + +COPY . . + +RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ + export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ + fi && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake --build build --config Release -j$(nproc) + +RUN mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib \; + +RUN mkdir -p /app/full \ + && cp build/bin/* /app/full \ + && cp *.py /app/full \ + && cp -r gguf-py /app/full \ + && cp -r requirements /app/full \ + && cp requirements.txt /app/full \ + && cp .devops/tools.sh /app/full/tools.sh + +## Base image +FROM ${BASE_CUDA_RUN_CONTAINER} AS base + +RUN apt-get update \ + && apt-get install -y libgomp1 curl\ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=build /app/lib/ /app + +### Full +FROM base AS full + +COPY --from=build /app/full /app + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y \ + git \ + python3 \ + python3-pip \ + && pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + + +ENTRYPOINT ["/app/tools.sh"] + +### Light, CLI only +FROM base AS light + +COPY --from=build /app/full/llama-cli /app + +WORKDIR /app + +ENTRYPOINT [ "/app/llama-cli" ] + +### Server, Server only +FROM base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +COPY --from=build /app/full/llama-server /app + +WORKDIR /app + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile deleted file mode 100644 index 05bff1bdf6a..00000000000 --- a/.devops/full-cuda.Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -ARG UBUNTU_VERSION=22.04 -# This needs to generally match the container host's environment. -ARG CUDA_VERSION=12.6.0 -# Target the CUDA build image -ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} - -FROM ${BASE_CUDA_DEV_CONTAINER} AS build - -# CUDA architecture to build for (defaults to all supported archs) -ARG CUDA_DOCKER_ARCH=default - -RUN apt-get update && \ - apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1 - -COPY requirements.txt requirements.txt -COPY requirements requirements - -RUN pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt - -WORKDIR /app - -COPY . . - -# Use the default CUDA archs if not specified -RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ - export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ - fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release -j$(nproc) && \ - cp build/bin/* . - -ENTRYPOINT ["/app/.devops/tools.sh"] diff --git a/.devops/full-musa.Dockerfile b/.devops/full-musa.Dockerfile deleted file mode 100644 index 3193fea1e9a..00000000000 --- a/.devops/full-musa.Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -ARG UBUNTU_VERSION=22.04 -# This needs to generally match the container host's environment. -ARG MUSA_VERSION=rc3.1.0 -# Target the MUSA build image -ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION} - -FROM ${BASE_MUSA_DEV_CONTAINER} AS build - -# MUSA architecture to build for (defaults to all supported archs) -ARG MUSA_DOCKER_ARCH=default - -RUN apt-get update && \ - apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1 - -COPY requirements.txt requirements.txt -COPY requirements requirements - -RUN pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt - -WORKDIR /app - -COPY . . - -# Use the default MUSA archs if not specified -RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \ - export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \ - fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release -j$(nproc) && \ - cp build/bin/* . - -ENTRYPOINT ["/app/.devops/tools.sh"] diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile deleted file mode 100644 index df496bcd2b7..00000000000 --- a/.devops/full-rocm.Dockerfile +++ /dev/null @@ -1,50 +0,0 @@ -ARG UBUNTU_VERSION=22.04 - -# This needs to generally match the container host's environment. -ARG ROCM_VERSION=5.6 - -# Target the CUDA build image -ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete - -FROM ${BASE_ROCM_DEV_CONTAINER} AS build - -# Unless otherwise specified, we make a fat build. -# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 -# This is mostly tied to rocBLAS supported archs. -ARG ROCM_DOCKER_ARCH="\ - gfx803 \ - gfx900 \ - gfx906 \ - gfx908 \ - gfx90a \ - gfx1010 \ - gfx1030 \ - gfx1100 \ - gfx1101 \ - gfx1102" - -COPY requirements.txt requirements.txt -COPY requirements requirements - -RUN pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt - -WORKDIR /app - -COPY . . - -# Set nvcc architecture -ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} -# Enable ROCm -ENV GGML_HIPBLAS=1 -ENV CC=/opt/rocm/llvm/bin/clang -ENV CXX=/opt/rocm/llvm/bin/clang++ - -# Enable cURL -ENV LLAMA_CURL=1 -RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev - -RUN make -j$(nproc) - -ENTRYPOINT ["/app/.devops/tools.sh"] diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile deleted file mode 100644 index d93c0be6a70..00000000000 --- a/.devops/full.Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -ARG UBUNTU_VERSION=22.04 - -FROM ubuntu:$UBUNTU_VERSION AS build - -RUN apt-get update && \ - apt-get install -y build-essential git cmake libcurl4-openssl-dev - -WORKDIR /app - -COPY . . - -RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ - cmake --build build -j $(nproc) && \ - mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib/ \; - -FROM ubuntu:$UBUNTU_VERSION as runtime - -WORKDIR /app - -RUN apt-get update && \ - apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 - -COPY requirements.txt /app/requirements.txt -COPY requirements /app/requirements -COPY .devops/tools.sh /app/tools.sh - -RUN pip install --upgrade pip setuptools wheel && \ - pip install -r /app/requirements.txt - -COPY --from=build /app/build/bin/ /app/ -COPY --from=build /app/lib/ /app/ -COPY --from=build /app/convert_hf_to_gguf.py /app/ -COPY --from=build /app/gguf-py /app/gguf-py - -ENV LC_ALL=C.utf8 - -ENTRYPOINT ["/app/tools.sh"] diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile new file mode 100644 index 00000000000..af783f5e998 --- /dev/null +++ b/.devops/intel.Dockerfile @@ -0,0 +1,91 @@ +ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04 + +## Build Image + +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build + +ARG GGML_SYCL_F16=OFF +RUN apt-get update && \ + apt-get install -y git libcurl4-openssl-dev + +WORKDIR /app + +COPY . . + +RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ + echo "GGML_SYCL_F16 is set" \ + && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ + fi && \ + echo "Building with dynamic libs" && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ + cmake --build build --config Release -j$(nproc) + +RUN mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib \; + +RUN mkdir -p /app/full \ + && cp build/bin/* /app/full \ + && cp *.py /app/full \ + && cp -r gguf-py /app/full \ + && cp -r requirements /app/full \ + && cp requirements.txt /app/full \ + && cp .devops/tools.sh /app/full/tools.sh + +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base + +RUN apt-get update \ + && apt-get install -y libgomp1 curl\ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +### Full +FROM base AS full + +COPY --from=build /app/lib/ /app +COPY --from=build /app/full /app + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y \ + git \ + python3 \ + python3-pip \ + && pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + + +ENTRYPOINT ["/app/tools.sh"] + +### Light, CLI only +FROM base AS light + +COPY --from=build /app/lib/ /app +COPY --from=build /app/full/llama-cli /app + +WORKDIR /app + +ENTRYPOINT [ "/app/llama-cli" ] + +### Server, Server only +FROM base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +COPY --from=build /app/lib/ /app +COPY --from=build /app/full/llama-server /app + +WORKDIR /app + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] + diff --git a/.devops/llama-cli-cuda.Dockerfile b/.devops/llama-cli-cuda.Dockerfile deleted file mode 100644 index 7796891d5b5..00000000000 --- a/.devops/llama-cli-cuda.Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -ARG UBUNTU_VERSION=22.04 -# This needs to generally match the container host's environment. -ARG CUDA_VERSION=12.6.0 -# Target the CUDA build image -ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} -# Target the CUDA runtime image -ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} - -FROM ${BASE_CUDA_DEV_CONTAINER} AS build - -# CUDA architecture to build for (defaults to all supported archs) -ARG CUDA_DOCKER_ARCH=default - -RUN apt-get update && \ - apt-get install -y build-essential git cmake - -WORKDIR /app - -COPY . . - -# Use the default CUDA archs if not specified -RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ - export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ - fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-cli -j$(nproc) && \ - mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib \; - -FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime - -RUN apt-get update && \ - apt-get install -y libgomp1 - -COPY --from=build /app/lib/ / -COPY --from=build /app/build/bin/llama-cli / - -ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile deleted file mode 100644 index 0706f732a98..00000000000 --- a/.devops/llama-cli-intel.Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04 - -FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build - -ARG GGML_SYCL_F16=OFF -RUN apt-get update && \ - apt-get install -y git - -WORKDIR /app - -COPY . . - -RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ - echo "GGML_SYCL_F16 is set" && \ - export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ - fi && \ - echo "Building with static libs" && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \ - ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \ - cmake --build build --config Release --target llama-cli - -FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime - -COPY --from=build /app/build/bin/llama-cli /llama-cli - -ENV LC_ALL=C.utf8 - -ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-cli-musa.Dockerfile b/.devops/llama-cli-musa.Dockerfile deleted file mode 100644 index e7c75af20e2..00000000000 --- a/.devops/llama-cli-musa.Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -ARG UBUNTU_VERSION=22.04 -# This needs to generally match the container host's environment. -ARG MUSA_VERSION=rc3.1.0 -# Target the MUSA build image -ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION} -# Target the MUSA runtime image -ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} - -FROM ${BASE_MUSA_DEV_CONTAINER} AS build - -# MUSA architecture to build for (defaults to all supported archs) -ARG MUSA_DOCKER_ARCH=default - -RUN apt-get update && \ - apt-get install -y build-essential git cmake - -WORKDIR /app - -COPY . . - -# Use the default MUSA archs if not specified -RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \ - export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \ - fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-cli -j$(nproc) && \ - mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib \; - -FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime - -RUN apt-get update && \ - apt-get install -y libgomp1 - -COPY --from=build /app/lib/ / -COPY --from=build /app/build/bin/llama-cli /llama-cli - -ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-cli-rocm.Dockerfile b/.devops/llama-cli-rocm.Dockerfile deleted file mode 100644 index e60c747bdbf..00000000000 --- a/.devops/llama-cli-rocm.Dockerfile +++ /dev/null @@ -1,45 +0,0 @@ -ARG UBUNTU_VERSION=22.04 - -# This needs to generally match the container host's environment. -ARG ROCM_VERSION=5.6 - -# Target the CUDA build image -ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete - -FROM ${BASE_ROCM_DEV_CONTAINER} AS build - -# Unless otherwise specified, we make a fat build. -# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 -# This is mostly tied to rocBLAS supported archs. -ARG ROCM_DOCKER_ARCH="\ - gfx803 \ - gfx900 \ - gfx906 \ - gfx908 \ - gfx90a \ - gfx1010 \ - gfx1030 \ - gfx1100 \ - gfx1101 \ - gfx1102" - -COPY requirements.txt requirements.txt -COPY requirements requirements - -RUN pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt - -WORKDIR /app - -COPY . . - -# Set nvcc architecture -ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} -# Enable ROCm -ENV GGML_HIPBLAS=1 -ENV CC=/opt/rocm/llvm/bin/clang -ENV CXX=/opt/rocm/llvm/bin/clang++ - -RUN make -j$(nproc) llama-cli - -ENTRYPOINT [ "/app/llama-cli" ] diff --git a/.devops/llama-cli-vulkan.Dockerfile b/.devops/llama-cli-vulkan.Dockerfile deleted file mode 100644 index 92a6e047934..00000000000 --- a/.devops/llama-cli-vulkan.Dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -ARG UBUNTU_VERSION=jammy - -FROM ubuntu:$UBUNTU_VERSION AS build - -# Install build tools -RUN apt update && apt install -y git build-essential cmake wget libgomp1 - -# Install Vulkan SDK -RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ - wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ - apt update -y && \ - apt-get install -y vulkan-sdk - -# Build it -WORKDIR /app -COPY . . -RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 && \ - cmake --build build --config Release --target llama-cli - -# Clean up -WORKDIR / -RUN cp /app/build/bin/llama-cli /llama-cli && \ - rm -rf /app - -ENV LC_ALL=C.utf8 - -ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-cli.Dockerfile b/.devops/llama-cli.Dockerfile deleted file mode 100644 index be234d55dce..00000000000 --- a/.devops/llama-cli.Dockerfile +++ /dev/null @@ -1,29 +0,0 @@ -ARG UBUNTU_VERSION=22.04 - -FROM ubuntu:$UBUNTU_VERSION AS build - -RUN apt-get update && \ - apt-get install -y build-essential git cmake libcurl4-openssl-dev - -WORKDIR /app - -COPY . . - -RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ - cmake --build build -j $(nproc) && \ - mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib/ \; - -FROM ubuntu:$UBUNTU_VERSION AS runtime - -WORKDIR /app - -RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev libgomp1 curl - -COPY --from=build /app/build/bin/llama-cli /app/ -COPY --from=build /app/lib/ /app/ - -ENV LC_ALL=C.utf8 - -ENTRYPOINT [ "/app/llama-cli" ] diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile deleted file mode 100644 index bf8a198f99f..00000000000 --- a/.devops/llama-server-cuda.Dockerfile +++ /dev/null @@ -1,43 +0,0 @@ -ARG UBUNTU_VERSION=22.04 -# This needs to generally match the container host's environment. -ARG CUDA_VERSION=12.6.0 -# Target the CUDA build image -ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} -# Target the CUDA runtime image -ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} - -FROM ${BASE_CUDA_DEV_CONTAINER} AS build - -# CUDA architecture to build for (defaults to all supported archs) -ARG CUDA_DOCKER_ARCH=default - -RUN apt-get update && \ - apt-get install -y build-essential git cmake libcurl4-openssl-dev - -WORKDIR /app - -COPY . . - -# Use the default CUDA archs if not specified -RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ - export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ - fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-server -j$(nproc) && \ - mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib \; - -FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime - -RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev libgomp1 curl - -COPY --from=build /app/lib/ / -COPY --from=build /app/build/bin/llama-server /llama-server - -# Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile deleted file mode 100644 index b503b8cfe10..00000000000 --- a/.devops/llama-server-intel.Dockerfile +++ /dev/null @@ -1,34 +0,0 @@ -ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04 - -FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build - -ARG GGML_SYCL_F16=OFF -RUN apt-get update && \ - apt-get install -y git libcurl4-openssl-dev - -WORKDIR /app - -COPY . . - -RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ - echo "GGML_SYCL_F16 is set" && \ - export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ - fi && \ - echo "Building with dynamic libs" && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ - cmake --build build --config Release --target llama-server - -FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime - -RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev curl - -COPY --from=build /app/build/bin/llama-server /llama-server - -ENV LC_ALL=C.utf8 -# Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/llama-server-musa.Dockerfile b/.devops/llama-server-musa.Dockerfile deleted file mode 100644 index cebe51d42fa..00000000000 --- a/.devops/llama-server-musa.Dockerfile +++ /dev/null @@ -1,43 +0,0 @@ -ARG UBUNTU_VERSION=22.04 -# This needs to generally match the container host's environment. -ARG MUSA_VERSION=rc3.1.0 -# Target the MUSA build image -ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION} -# Target the MUSA runtime image -ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} - -FROM ${BASE_MUSA_DEV_CONTAINER} AS build - -# MUSA architecture to build for (defaults to all supported archs) -ARG MUSA_DOCKER_ARCH=default - -RUN apt-get update && \ - apt-get install -y build-essential git cmake libcurl4-openssl-dev - -WORKDIR /app - -COPY . . - -# Use the default MUSA archs if not specified -RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \ - export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \ - fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-server -j$(nproc) && \ - mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib \; - -FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime - -RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev libgomp1 curl - -COPY --from=build /app/lib/ / -COPY --from=build /app/build/bin/llama-server /llama-server - -# Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile deleted file mode 100644 index 8553af75b61..00000000000 --- a/.devops/llama-server-rocm.Dockerfile +++ /dev/null @@ -1,54 +0,0 @@ -ARG UBUNTU_VERSION=22.04 - -# This needs to generally match the container host's environment. -ARG ROCM_VERSION=5.6 - -# Target the CUDA build image -ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete - -FROM ${BASE_ROCM_DEV_CONTAINER} AS build - -# Unless otherwise specified, we make a fat build. -# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 -# This is mostly tied to rocBLAS supported archs. -ARG ROCM_DOCKER_ARCH="\ - gfx803 \ - gfx900 \ - gfx906 \ - gfx908 \ - gfx90a \ - gfx1010 \ - gfx1030 \ - gfx1100 \ - gfx1101 \ - gfx1102" - -COPY requirements.txt requirements.txt -COPY requirements requirements - -RUN pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt - -WORKDIR /app - -COPY . . - -# Set nvcc architecture -ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} -# Enable ROCm -ENV GGML_HIPBLAS=1 -ENV CC=/opt/rocm/llvm/bin/clang -ENV CXX=/opt/rocm/llvm/bin/clang++ -# Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 - -# Enable cURL -ENV LLAMA_CURL=1 -RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev curl - -RUN make -j$(nproc) llama-server - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/llama-server-vulkan.Dockerfile deleted file mode 100644 index 6aa7867791a..00000000000 --- a/.devops/llama-server-vulkan.Dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -ARG UBUNTU_VERSION=jammy - -FROM ubuntu:$UBUNTU_VERSION AS build - -# Install build tools -RUN apt update && apt install -y git build-essential cmake wget - -# Install Vulkan SDK and cURL -RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ - wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ - apt update -y && \ - apt-get install -y vulkan-sdk libcurl4-openssl-dev curl - -# Build it -WORKDIR /app -COPY . . -RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \ - cmake --build build --config Release --target llama-server - -# Clean up -WORKDIR / -RUN cp /app/build/bin/llama-server /llama-server && \ - rm -rf /app - -ENV LC_ALL=C.utf8 -# Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile deleted file mode 100644 index 72ccde2feae..00000000000 --- a/.devops/llama-server.Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -ARG UBUNTU_VERSION=22.04 - -FROM ubuntu:$UBUNTU_VERSION AS build - -RUN apt-get update && \ - apt-get install -y build-essential git cmake libcurl4-openssl-dev - -WORKDIR /app - -COPY . . - -RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ - cmake --build build -j $(nproc) && \ - mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib/ \; - -FROM ubuntu:$UBUNTU_VERSION AS runtime - -WORKDIR /app - -RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev libgomp1 curl - -COPY --from=build /app/build/bin/llama-server /app/ -COPY --from=build /app/lib/ /app/ - -ENV LC_ALL=C.utf8 -# Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile new file mode 100644 index 00000000000..bfd7fc1c174 --- /dev/null +++ b/.devops/musa.Dockerfile @@ -0,0 +1,108 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG MUSA_VERSION=rc3.1.0 +# Target the MUSA build image +ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION} + +ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_MUSA_DEV_CONTAINER} AS build + +# MUSA architecture to build for (defaults to all supported archs) +ARG MUSA_DOCKER_ARCH=default + +RUN apt-get update && \ + apt-get install -y \ + build-essential \ + cmake \ + python3 \ + python3-pip \ + git \ + libcurl4-openssl-dev \ + libgomp1 + +COPY requirements.txt requirements.txt +COPY requirements requirements + +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt + +WORKDIR /app + +COPY . . + +# Use the default MUSA archs if not specified +RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \ + export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \ + fi && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake --build build --config Release -j$(nproc) + +RUN mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib \; + +RUN mkdir -p /app/full \ + && cp build/bin/* /app/full \ + && cp *.py /app/full \ + && cp -r gguf-py /app/full \ + && cp -r requirements /app/full \ + && cp requirements.txt /app/full \ + && cp .devops/tools.sh /app/full/tools.sh + +## Base image +FROM ${BASE_MUSA_RUN_CONTAINER} AS base + +RUN apt-get update \ + && apt-get install -y libgomp1 curl\ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=build /app/lib/ /app + +### Full +FROM base AS full + +COPY --from=build /app/full /app + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y \ + git \ + python3 \ + python3-pip \ + && pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + + +ENTRYPOINT ["/app/tools.sh"] + +### Light, CLI only +FROM base AS light + +COPY --from=build /app/full/llama-cli /app + +WORKDIR /app + +ENTRYPOINT [ "/app/llama-cli" ] + +### Server, Server only +FROM base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +COPY --from=build /app/full/llama-server /app + +WORKDIR /app + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index b88e6ca809f..043c4364b95 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -31,6 +31,7 @@ # Increases the runtime closure size by ~700M useMpi ? false, useRocm ? config.rocmSupport, + rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets, enableCurl ? true, useVulkan ? false, llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake @@ -188,7 +189,7 @@ effectiveStdenv.mkDerivation (finalAttrs: { ] ++ optionals useRocm [ (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang") - (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets)) + (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets) ] ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile new file mode 100644 index 00000000000..a8088ea00da --- /dev/null +++ b/.devops/rocm.Dockerfile @@ -0,0 +1,113 @@ +ARG UBUNTU_VERSION=24.04 + +# This needs to generally match the container host's environment. +ARG ROCM_VERSION=6.3 +ARG AMDGPU_VERSION=6.3 + +# Target the CUDA build image +ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete + +### Build image +FROM ${BASE_ROCM_DEV_CONTAINER} AS build + +# Unless otherwise specified, we make a fat build. +# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# This is mostly tied to rocBLAS supported archs. +# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported +# gfx906 is deprecated +#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html + +#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102' +ARG ROCM_DOCKER_ARCH=gfx1100 + +# Set nvcc architectured +ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} +# Enable ROCm +# ENV CC=/opt/rocm/llvm/bin/clang +# ENV CXX=/opt/rocm/llvm/bin/clang++ + +RUN apt-get update \ + && apt-get install -y \ + build-essential \ + cmake \ + git \ + libcurl4-openssl-dev \ + curl \ + libgomp1 + +WORKDIR /app + +COPY . . + +RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ + cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \ + && cmake --build build --config Release -j$(nproc) + +RUN mkdir -p /app/lib \ + && find build -name "*.so" -exec cp {} /app/lib \; + +RUN mkdir -p /app/full \ + && cp build/bin/* /app/full \ + && cp *.py /app/full \ + && cp -r gguf-py /app/full \ + && cp -r requirements /app/full \ + && cp requirements.txt /app/full \ + && cp .devops/tools.sh /app/full/tools.sh + +## Base image +FROM ${BASE_ROCM_DEV_CONTAINER} AS base + +RUN apt-get update \ + && apt-get install -y libgomp1 curl\ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=build /app/lib/ /app + +### Full +FROM base AS full + +COPY --from=build /app/full /app + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y \ + git \ + python3-pip \ + python3 \ + python3-wheel\ + && pip install --break-system-packages --upgrade setuptools \ + && pip install --break-system-packages -r requirements.txt \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +ENTRYPOINT ["/app/tools.sh"] + +### Light, CLI only +FROM base AS light + +COPY --from=build /app/full/llama-cli /app + +WORKDIR /app + +ENTRYPOINT [ "/app/llama-cli" ] + +### Server, Server only +FROM base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +COPY --from=build /app/full/llama-server /app + +WORKDIR /app + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/tools.sh b/.devops/tools.sh index 24dcfd35079..9a86e6ea018 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -8,11 +8,11 @@ arg1="$1" shift if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then - python3 ./convert_hf_to_gguf.py "$@" + exec python3 ./convert_hf_to_gguf.py "$@" elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then - ./llama-quantize "$@" + exec ./llama-quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then - ./llama-cli "$@" + exec ./llama-cli "$@" elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Converting PTH to GGML..." for i in `ls $1/$2/ggml-model-f16.bin*`; do @@ -20,11 +20,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Skip model quantization, it already exists: ${i/f16/q4_0}" else echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..." - ./llama-quantize "$i" "${i/f16/q4_0}" q4_0 + exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0 fi done elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then - ./llama-server "$@" + exec ./llama-server "$@" else echo "Unknown command: $arg1" echo "Available commands: " diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile new file mode 100644 index 00000000000..cfc2162e38b --- /dev/null +++ b/.devops/vulkan.Dockerfile @@ -0,0 +1,88 @@ +ARG UBUNTU_VERSION=jammy + +FROM ubuntu:$UBUNTU_VERSION AS build + +# Install build tools +RUN apt update && apt install -y git build-essential cmake wget + +# Install Vulkan SDK and cURL +RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ + wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ + apt update -y && \ + apt-get install -y vulkan-sdk libcurl4-openssl-dev curl + +# Build it +WORKDIR /app + +COPY . . + +RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \ + cmake --build build --config Release -j$(nproc) + +RUN mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib \; + +RUN mkdir -p /app/full \ + && cp build/bin/* /app/full \ + && cp *.py /app/full \ + && cp -r gguf-py /app/full \ + && cp -r requirements /app/full \ + && cp requirements.txt /app/full \ + && cp .devops/tools.sh /app/full/tools.sh + +## Base image +FROM ubuntu:$UBUNTU_VERSION AS base + +RUN apt-get update \ + && apt-get install -y libgomp1 curl\ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=build /app/lib/ /app + +### Full +FROM base AS full + +COPY --from=build /app/full /app + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y \ + git \ + python3 \ + python3-pip \ + && pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +ENTRYPOINT ["/app/tools.sh"] + +### Light, CLI only +FROM base AS light + +COPY --from=build /app/full/llama-cli /app + +WORKDIR /app + +ENTRYPOINT [ "/app/llama-cli" ] + +### Server, Server only +FROM base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +COPY --from=build /app/full/llama-server /app + +WORKDIR /app + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 886d33d2d56..a377eff38fb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -317,7 +317,7 @@ jobs: wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list sudo apt-get update -y - sudo apt-get install -y build-essential vulkan-sdk + sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk - name: Build id: cmake_build @@ -327,6 +327,12 @@ jobs: cmake -DGGML_VULKAN=ON .. cmake --build . --config Release -j $(nproc) + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose --timeout 900 + ubuntu-22-cmake-hip: runs-on: ubuntu-22.04 container: rocm/dev-ubuntu-22.04:6.0.2 @@ -662,6 +668,8 @@ jobs: defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' - build: 'msvc-arm64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' + - build: 'llvm-arm64-opencl-adreno' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' steps: - name: Clone @@ -703,6 +711,28 @@ jobs: run: | choco install ninja + - name: Install OpenCL Headers and Libs + id: install_opencl + if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }} + run: | + git clone https://github.com/KhronosGroup/OpenCL-Headers + cd OpenCL-Headers + mkdir build && cd build + cmake .. ` + -DBUILD_TESTING=OFF ` + -DOPENCL_HEADERS_BUILD_TESTING=OFF ` + -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF ` + -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" + cmake --build . --target install + git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader + cd OpenCL-ICD-Loader + mkdir build-arm64-release && cd build-arm64-release + cmake .. ` + -A arm64 ` + -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" ` + -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" + cmake --build . --target install --config release + - name: Build id: cmake_build run: | @@ -732,7 +762,7 @@ jobs: - name: Test id: cmake_test # not all machines have native AVX-512 - if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} + if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} run: | cd build ctest -L main -C Release --verbose --timeout 900 diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index bc2e5020de2..41f1a89eebb 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -34,21 +34,14 @@ jobs: strategy: matrix: config: - - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" } - - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" } + # Multi-stage build + - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false} + - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} + - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} + - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} + - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete - #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" } + #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true } steps: - name: Check out the repo uses: actions/checkout@v4 @@ -56,10 +49,10 @@ jobs: fetch-depth: 0 # preserve git history, so we can determine the build number - name: Set up QEMU - uses: docker/setup-qemu-action@v2 + uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Log in to Docker Hub uses: docker/login-action@v2 @@ -79,25 +72,34 @@ jobs: # determine tag name postfix (build number, commit hash) if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then - TAG_POSTFIX="b${BUILD_NUMBER}" + TAG_POSTFIX="-b${BUILD_NUMBER}" else SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-') - TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}" + TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}" fi - # list all tags possible - TAGS="" - TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}," - TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}" - - echo "output_tags=$TAGS" >> $GITHUB_OUTPUT - echo "output_tags=$TAGS" # print out for debugging + if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then + TYPE="" + else + TYPE="-${{ matrix.config.tag }}" + fi + PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:" + FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}" + LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}" + SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}" + echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT + echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT + echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT + echo "full_output_tags=$FULLTAGS" # print out for debugging + echo "light_output_tags=$LIGHTTAGS" # print out for debugging + echo "server_output_tags=$SERVERTAGS" # print out for debugging env: GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example - name: Free Disk Space (Ubuntu) + if: ${{ matrix.config.free_disk_space == true }} uses: jlumbroso/free-disk-space@main with: # this might remove tools that are actually needed, @@ -113,13 +115,59 @@ jobs: docker-images: true swap-storage: true - - name: Build and push Docker image (tagged + versioned) - if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} + - name: Build and push Full Docker image (tagged + versioned) + if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }} + uses: docker/build-push-action@v6 + with: + context: . + push: true + platforms: ${{ matrix.config.platforms }} + # tag list is generated from step above + tags: ${{ steps.tag.outputs.full_output_tags }} + file: ${{ matrix.config.dockerfile }} + target: full + provenance: false + # using github experimental cache + cache-from: type=gha + cache-to: type=gha,mode=max + # return to this if the experimental github cache is having issues + #cache-to: type=local,dest=/tmp/.buildx-cache + #cache-from: type=local,src=/tmp/.buildx-cache + + - name: Build and push Light Docker image (tagged + versioned) + if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }} + uses: docker/build-push-action@v6 + with: + context: . + push: true + platforms: ${{ matrix.config.platforms }} + # tag list is generated from step above + tags: ${{ steps.tag.outputs.light_output_tags }} + file: ${{ matrix.config.dockerfile }} + target: light + provenance: false + # using github experimental cache + cache-from: type=gha + cache-to: type=gha,mode=max + # return to this if the experimental github cache is having issues + #cache-to: type=local,dest=/tmp/.buildx-cache + #cache-from: type=local,src=/tmp/.buildx-cache + + - name: Build and push Server Docker image (tagged + versioned) + if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }} uses: docker/build-push-action@v6 with: context: . push: true platforms: ${{ matrix.config.platforms }} # tag list is generated from step above - tags: ${{ steps.tag.outputs.output_tags }} + tags: ${{ steps.tag.outputs.server_output_tags }} file: ${{ matrix.config.dockerfile }} + target: server + provenance: false + # using github experimental cache + cache-from: type=gha + cache-to: type=gha,mode=max + # return to this if the experimental github cache is having issues + #cache-to: type=local,dest=/tmp/.buildx-cache + #cache-from: type=local,src=/tmp/.buildx-cache diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 9e66fb68cff..671fe595cdf 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -79,7 +79,7 @@ jobs: # Setup nodejs (to be used for verifying bundled index.html) - uses: actions/setup-node@v4 with: - node-version: 22 + node-version: '22.11.0' - name: Verify bundled index.html id: verify_server_index_html diff --git a/CODEOWNERS b/CODEOWNERS index 88ab6de4fb6..adeba539583 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,3 +1,5 @@ # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs -ci/ @ggerganov +/ci/ @ggerganov +/.devops/ @ngxson +/examples/server/ @ngxson diff --git a/Makefile b/Makefile index bcea450e46b..19ae0d5f1c8 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,7 @@ BUILD_TARGETS = \ llama-infill \ llama-llava-cli \ llama-minicpmv-cli\ + llama-qwen2vl-cli\ llama-lookahead \ llama-lookup \ llama-lookup-create \ @@ -1404,6 +1405,14 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual +llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \ + examples/llava/llava.cpp \ + examples/llava/llava.h \ + examples/llava/clip.cpp \ + examples/llava/clip.h \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual + ifeq ($(UNAME_S),Darwin) swift: examples/batched.swift (cd examples/batched.swift; make build) diff --git a/README.md b/README.md index 6fdd8d9eefb..d6d1958c8fc 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat) - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a) - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM) +- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct) #### Multimodal @@ -110,6 +111,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM) - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2) - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny) +- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d) @@ -219,7 +221,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU | | [MUSA](docs/build.md#musa) | Moore Threads MTT GPU | | [CUDA](docs/build.md#cuda) | Nvidia GPU | -| [hipBLAS](docs/build.md#hipblas) | AMD GPU | +| [HIP](docs/build.md#hip) | AMD GPU | | [Vulkan](docs/build.md#vulkan) | GPU | | [CANN](docs/build.md#cann) | Ascend NPU | @@ -412,7 +414,7 @@ To learn more about model quantization, [read this documentation](examples/quant [^1]: [examples/perplexity/README.md](examples/perplexity/README.md) [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity) -## [`llama-bench`](example/bench) +## [`llama-bench`](examples/llama-bench) #### Benchmark the performance of the inference for various parameters. @@ -433,6 +435,20 @@ To learn more about model quantization, [read this documentation](examples/quant +## [`llama-run`](examples/run) + +#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3]. + +-
+ Run a model with a specific prompt (by default it's pulled from Ollama registry) + + ```bash + llama-run granite-code + ``` + +
+ +[^3]: [RamaLama](https://github.com/containers/ramalama) ## [`llama-simple`](examples/simple) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 89862fe1139..df1cdf9a59a 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -81,7 +81,7 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info) # Use curl to download model url if (LLAMA_CURL) find_package(CURL REQUIRED) - add_definitions(-DLLAMA_USE_CURL) + target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL) include_directories(${CURL_INCLUDE_DIRS}) find_library(CURL_LIBRARY curl REQUIRED) set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY}) diff --git a/common/arg.cpp b/common/arg.cpp index 49af3168251..deb11378657 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -119,32 +119,65 @@ std::string common_arg::to_string() { // utils // -static void common_params_handle_model_default(common_params & params) { - if (!params.hf_repo.empty()) { +static void common_params_handle_model_default( + std::string & model, + std::string & model_url, + std::string & hf_repo, + std::string & hf_file) { + if (!hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model - if (params.hf_file.empty()) { - if (params.model.empty()) { + if (hf_file.empty()) { + if (model.empty()) { throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); } - params.hf_file = params.model; - } else if (params.model.empty()) { + hf_file = model; + } else if (model.empty()) { // this is to avoid different repo having same file name, or same file name in different subdirs - std::string filename = params.hf_repo + "_" + params.hf_file; + std::string filename = hf_repo + "_" + hf_file; // to make sure we don't have any slashes in the filename string_replace_all(filename, "/", "_"); - params.model = fs_get_cache_file(filename); + model = fs_get_cache_file(filename); } - } else if (!params.model_url.empty()) { - if (params.model.empty()) { - auto f = string_split(params.model_url, '#').front(); + } else if (!model_url.empty()) { + if (model.empty()) { + auto f = string_split(model_url, '#').front(); f = string_split(f, '?').front(); - params.model = fs_get_cache_file(string_split(f, '/').back()); + model = fs_get_cache_file(string_split(f, '/').back()); } - } else if (params.model.empty()) { - params.model = DEFAULT_MODEL_PATH; + } else if (model.empty()) { + model = DEFAULT_MODEL_PATH; } } +const std::vector kv_cache_types = { + GGML_TYPE_F32, + GGML_TYPE_F16, + GGML_TYPE_BF16, + GGML_TYPE_Q8_0, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_IQ4_NL, + GGML_TYPE_Q5_0, + GGML_TYPE_Q5_1, +}; + +static ggml_type kv_cache_type_from_str(const std::string & s) { + for (const auto & type : kv_cache_types) { + if (ggml_type_name(type) == s) { + return type; + } + } + throw std::runtime_error("Unsupported cache type: " + s); +} + +static std::string get_all_kv_cache_types() { + std::ostringstream msg; + for (const auto & type : kv_cache_types) { + msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", "); + } + return msg.str(); +} + // // CLI argument parsing functions // @@ -247,7 +280,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } - common_params_handle_model_default(params); + // TODO: refactor model params in a common struct + common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file); + common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file); if (params.escape) { string_process_escapes(params.prompt); @@ -591,7 +626,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.ctx_shift = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); add_opt(common_arg( {"--chunks"}, "N", string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), @@ -813,7 +848,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_sparam()); add_opt(common_arg( - {"--sampling-seq"}, "SEQUENCE", + {"--sampling-seq", "--sampler-seq"}, "SEQUENCE", string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), [](common_params & params, const std::string & value) { params.sampling.samplers = common_sampler_types_from_chars(value); @@ -826,13 +861,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.ignore_eos = true; } ).set_sparam()); - add_opt(common_arg( - {"--penalize-nl"}, - string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"), - [](common_params & params) { - params.sampling.penalize_nl = true; - } - ).set_sparam()); add_opt(common_arg( {"--temp"}, "N", string_format("temperature (default: %.1f)", (double)params.sampling.temp), @@ -887,6 +915,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--repeat-last-n"}, "N", string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n), [](common_params & params, int value) { + if (value < -1) { + throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value)); + } params.sampling.penalty_last_n = value; params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n); } @@ -941,6 +972,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--dry-penalty-last-n"}, "N", string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n), [](common_params & params, int value) { + if (value < -1) { + throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value)); + } params.sampling.dry_penalty_last_n = value; } ).set_sparam()); @@ -1174,18 +1208,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); add_opt(common_arg( {"-ctk", "--cache-type-k"}, "TYPE", - string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), + string_format( + "KV cache data type for K\n" + "allowed values: %s\n" + "(default: %s)", + get_all_kv_cache_types().c_str(), + ggml_type_name(params.cache_type_k) + ), [](common_params & params, const std::string & value) { - // TODO: get the type right here - params.cache_type_k = value; + params.cache_type_k = kv_cache_type_from_str(value); } ).set_env("LLAMA_ARG_CACHE_TYPE_K")); add_opt(common_arg( {"-ctv", "--cache-type-v"}, "TYPE", - string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), + string_format( + "KV cache data type for V\n" + "allowed values: %s\n" + "(default: %s)", + get_all_kv_cache_types().c_str(), + ggml_type_name(params.cache_type_v) + ), [](common_params & params, const std::string & value) { - // TODO: get the type right here - params.cache_type_v = value; + params.cache_type_v = kv_cache_type_from_str(value); } ).set_env("LLAMA_ARG_CACHE_TYPE_V")); add_opt(common_arg( @@ -1543,6 +1587,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); + add_opt(common_arg( + {"-hfrv", "--hf-repo-v"}, "REPO", + "Hugging Face model repository for the vocoder model (default: unused)", + [](common_params & params, const std::string & value) { + params.vocoder.hf_repo = value; + } + ).set_env("LLAMA_ARG_HF_REPO_V")); + add_opt(common_arg( + {"-hffv", "--hf-file-v"}, "FILE", + "Hugging Face model file for the vocoder model (default: unused)", + [](common_params & params, const std::string & value) { + params.vocoder.hf_file = value; + } + ).set_env("LLAMA_ARG_HF_FILE_V")); add_opt(common_arg( {"-hft", "--hf-token"}, "TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)", @@ -2083,35 +2141,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.speculative.n_max = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX")); add_opt(common_arg( {"--draft-min", "--draft-n-min"}, "N", string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min), [](common_params & params, int value) { params.speculative.n_min = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN")); add_opt(common_arg( {"--draft-p-split"}, "P", string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split), [](common_params & params, const std::string & value) { params.speculative.p_split = std::stof(value); } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT")); add_opt(common_arg( {"--draft-p-min"}, "P", string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min), [](common_params & params, const std::string & value) { params.speculative.p_min = std::stof(value); } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN")); add_opt(common_arg( {"-cd", "--ctx-size-draft"}, "N", string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), [](common_params & params, int value) { params.speculative.n_ctx = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT")); add_opt(common_arg( {"-devd", "--device-draft"}, "", "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" @@ -2131,14 +2189,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT")); add_opt(common_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", [](common_params & params, const std::string & value) { params.speculative.model = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT")); + + add_opt(common_arg( + {"-mv", "--model-vocoder"}, "FNAME", + "vocoder model for audio generation (default: unused)", + [](common_params & params, const std::string & value) { + params.vocoder.model = value; + } + ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); + + // model-specific + add_opt(common_arg( + {"--tts-oute-default"}, + string_format("use default OuteTTS models (note: can download weights from the internet)"), + [](common_params & params) { + params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF"; + params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf"; + params.vocoder.hf_repo = "ggml-org/WavTokenizer"; + params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf"; + } + ).set_examples({LLAMA_EXAMPLE_TTS})); return ctx_arg; } diff --git a/common/common.cpp b/common/common.cpp index b1355b835dc..f40e3a289fb 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -940,6 +940,25 @@ struct common_init_result common_init_from_params(common_params & params) { params.sampling.ignore_eos = false; } + if (params.sampling.ignore_eos) { + for (llama_token i = 0; i < llama_n_vocab(model); i++) { + if (llama_token_is_eog(model, i)) { + LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY); + params.sampling.logit_bias.push_back({i, -INFINITY}); + } + } + } + + if (params.sampling.penalty_last_n == -1) { + LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); + params.sampling.penalty_last_n = llama_n_ctx(lctx); + } + + if (params.sampling.dry_penalty_last_n == -1) { + LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); + params.sampling.dry_penalty_last_n = llama_n_ctx(lctx); + } + if (params.warmup) { LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); @@ -1015,38 +1034,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { return mparams; } -static ggml_type kv_cache_type_from_str(const std::string & s) { - if (s == "f32") { - return GGML_TYPE_F32; - } - if (s == "f16") { - return GGML_TYPE_F16; - } - if (s == "bf16") { - return GGML_TYPE_BF16; - } - if (s == "q8_0") { - return GGML_TYPE_Q8_0; - } - if (s == "q4_0") { - return GGML_TYPE_Q4_0; - } - if (s == "q4_1") { - return GGML_TYPE_Q4_1; - } - if (s == "iq4_nl") { - return GGML_TYPE_IQ4_NL; - } - if (s == "q5_0") { - return GGML_TYPE_Q5_0; - } - if (s == "q5_1") { - return GGML_TYPE_Q5_1; - } - - throw std::runtime_error("Unsupported cache type: " + s); -} - struct llama_context_params common_context_params_to_llama(const common_params & params) { auto cparams = llama_context_default_params(); @@ -1081,8 +1068,8 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.pooling_type = LLAMA_POOLING_TYPE_RANK; } - cparams.type_k = kv_cache_type_from_str(params.cache_type_k); - cparams.type_v = kv_cache_type_from_str(params.cache_type_v); + cparams.type_k = params.cache_type_k; + cparams.type_v = params.cache_type_v; return cparams; } @@ -1108,13 +1095,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p #define CURL_MAX_RETRY 3 #define CURL_RETRY_DELAY_SECONDS 2 - -static bool starts_with(const std::string & str, const std::string & prefix) { - // While we wait for C++20's std::string::starts_with... - return str.rfind(prefix, 0) == 0; -} - -static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) { +static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) { int remaining_attempts = max_attempts; while (remaining_attempts > 0) { @@ -1138,7 +1119,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_ } static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { - // Initialize libcurl std::unique_ptr curl(curl_easy_init(), &curl_easy_cleanup); if (!curl) { @@ -1211,11 +1191,13 @@ static bool common_download_file(const std::string & url, const std::string & pa std::string etag; std::string last_modified; }; + common_load_model_from_url_headers headers; + { typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { - common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata; + common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata; static std::regex header_regex("([^:]+): (.*)\r\n"); static std::regex etag_regex("ETag", std::regex_constants::icase); @@ -1799,7 +1781,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) break; case 0: // max absolute for (int i = 0; i < n; i++) { - if (sum < std::abs(inp[i])) sum = std::abs(inp[i]); + if (sum < std::abs(inp[i])) { + sum = std::abs(inp[i]); + } } sum /= 32760.0; // make an int16 range break; diff --git a/common/common.h b/common/common.h index 86e9def47ba..76f2aed851f 100644 --- a/common/common.h +++ b/common/common.h @@ -37,9 +37,9 @@ using llama_tokens = std::vector; // build info extern int LLAMA_BUILD_NUMBER; -extern char const * LLAMA_COMMIT; -extern char const * LLAMA_COMPILER; -extern char const * LLAMA_BUILD_TARGET; +extern const char * LLAMA_COMMIT; +extern const char * LLAMA_COMPILER; +extern const char * LLAMA_BUILD_TARGET; struct common_control_vector_load_info; @@ -80,6 +80,7 @@ enum llama_example { LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_PARALLEL, + LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_COUNT, }; @@ -95,6 +96,7 @@ enum common_sampler_type { COMMON_SAMPLER_TYPE_TEMPERATURE = 7, COMMON_SAMPLER_TYPE_XTC = 8, COMMON_SAMPLER_TYPE_INFILL = 9, + COMMON_SAMPLER_TYPE_PENALTIES = 10, }; // dimensionality reduction methods, used by cvector-generator @@ -130,7 +132,6 @@ struct common_params_sampling { int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float mirostat_tau = 5.00f; // target entropy float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = false; // consider newlines as a repeatable token bool ignore_eos = false; bool no_perf = false; // disable performance metrics bool timing_per_token = false; @@ -139,6 +140,7 @@ struct common_params_sampling { std::vector samplers = { + COMMON_SAMPLER_TYPE_PENALTIES, COMMON_SAMPLER_TYPE_DRY, COMMON_SAMPLER_TYPE_TOP_K, COMMON_SAMPLER_TYPE_TYPICAL_P, @@ -158,6 +160,7 @@ struct common_params_sampling { struct common_params_speculative { std::vector devices; // devices to use for offloading + int32_t n_ctx = 0; // draft context size int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding @@ -171,6 +174,14 @@ struct common_params_speculative { std::string model = ""; // draft model for speculative decoding // NOLINT }; +struct common_params_vocoder { + std::string hf_repo = ""; // HF repo // NOLINT + std::string hf_file = ""; // HF file // NOLINT + + std::string model = ""; // model path // NOLINT + std::string model_url = ""; // model url to download // NOLINT +}; + struct common_params { int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 4096; // context size @@ -193,11 +204,13 @@ struct common_params { float defrag_thold = 0.1f; // KV cache defragmentation threshold // offload params - std::vector devices; // devices to use for offloading - int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs - enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs + std::vector devices; // devices to use for offloading + + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + + enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs struct cpu_params cpuparams; struct cpu_params cpuparams_batch; @@ -211,8 +224,9 @@ struct common_params { enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings - struct common_params_sampling sampling; + struct common_params_sampling sampling; struct common_params_speculative speculative; + struct common_params_vocoder vocoder; std::string model = ""; // model path // NOLINT std::string model_alias = ""; // model alias // NOLINT @@ -286,8 +300,8 @@ struct common_params { bool warmup = true; // warmup run bool check_tensors = false; // validate tensor data - std::string cache_type_k = "f16"; // KV cache data type for the K - std::string cache_type_v = "f16"; // KV cache data type for the V + ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K + ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V // multimodal models (see examples/llava) std::string mmproj = ""; // path to multimodal projector // NOLINT @@ -437,6 +451,11 @@ std::vector string_split(const std::string & input, ch return parts; } +static bool string_starts_with(const std::string & str, + const std::string & prefix) { // While we wait for C++20's std::string::starts_with... + return str.rfind(prefix, 0) == 0; +} + bool string_parse_kv_override(const char * data, std::vector & overrides); void string_process_escapes(std::string & input); @@ -588,7 +607,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si // Embedding utils // -void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2); +// TODO: repace embd_norm with an enum +void common_embd_normalize(const float * inp, float * out, int n, int embd_norm); float common_embd_similarity_cos(const float * embd1, const float * embd2, int n); diff --git a/common/sampling.cpp b/common/sampling.cpp index bb09be4bbca..6328624e63c 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -183,21 +183,21 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co params.penalty_present, params.penalize_nl, params.ignore_eos)); - + if (params.mirostat == 0) { for (const auto & cnstr : params.samplers) { switch (cnstr) { - case COMMON_SAMPLER_TYPE_DRY: + case COMMON_SAMPLER_TYPE_DRY: { - std::vector c_breakers; + std::vector c_breakers; c_breakers.reserve(params.dry_sequence_breakers.size()); - for (const auto& str : params.dry_sequence_breakers) { + for (const auto & str : params.dry_sequence_breakers) { c_breakers.push_back(str.c_str()); } llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); } - break; + break; case COMMON_SAMPLER_TYPE_TOP_K: llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); break; @@ -219,6 +219,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co case COMMON_SAMPLER_TYPE_INFILL: llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model)); break; + case COMMON_SAMPLER_TYPE_PENALTIES: + llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); + break; default: GGML_ASSERT(false && "unknown sampler type"); } @@ -447,6 +450,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't'; case COMMON_SAMPLER_TYPE_XTC: return 'x'; case COMMON_SAMPLER_TYPE_INFILL: return 'i'; + case COMMON_SAMPLER_TYPE_PENALTIES: return 'e'; default : return '?'; } } @@ -461,6 +465,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature"; case COMMON_SAMPLER_TYPE_XTC: return "xtc"; case COMMON_SAMPLER_TYPE_INFILL: return "infill"; + case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties"; default : return ""; } } @@ -475,6 +480,7 @@ std::vector common_sampler_types_from_names(const std::vect { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE }, { "xtc", COMMON_SAMPLER_TYPE_XTC }, { "infill", COMMON_SAMPLER_TYPE_INFILL }, + { "penalties", COMMON_SAMPLER_TYPE_PENALTIES }, }; // since samplers names are written multiple ways @@ -521,6 +527,7 @@ std::vector common_sampler_types_from_chars(const std::stri { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES), COMMON_SAMPLER_TYPE_PENALTIES }, }; std::vector samplers; diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c63d929c187..b6c15da94ec 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -221,17 +221,17 @@ def set_gguf_parameters(self): self.gguf_writer.add_context_length(n_ctx) logger.info(f"gguf: context length = {n_ctx}") - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - self.gguf_writer.add_embedding_length(n_embd) - logger.info(f"gguf: embedding length = {n_embd}") + if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None: + self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: self.gguf_writer.add_feed_forward_length(n_ff) logger.info(f"gguf: feed forward length = {n_ff}") - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_head_count(n_head) - logger.info(f"gguf: head count = {n_head}") + if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None: + self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) @@ -296,7 +296,9 @@ def prepare_tensors(self): break for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): - data = data_torch.squeeze().numpy() + # TODO: why do we squeeze here? + # data = data_torch.squeeze().numpy() + data = data_torch.numpy() # if data ends up empty, it means data_torch was a scalar tensor -> restore if len(data.shape) == 0: @@ -324,6 +326,8 @@ def prepare_tensors(self): gguf.MODEL_TENSOR.TIME_MIX_W2, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, + gguf.MODEL_TENSOR.POSNET_NORM1, + gguf.MODEL_TENSOR.POSNET_NORM2, ) ) or not new_name.endswith(".weight") @@ -525,9 +529,19 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: else: token: str = reverse_vocab[i] if token in added_vocab: + # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. + # To avoid unexpected issues - we make sure to normalize non-normalized tokens + if not tokenizer.added_tokens_decoder[i].normalized: + previous_token = token + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) + if previous_token != token: + logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") + if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token): toktypes.append(gguf.TokenType.CONTROL) else: + # NOTE: this was added for Gemma. + # Encoding and decoding the tokens above isn't sufficient for this case. token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces toktypes.append(gguf.TokenType.USER_DEFINED) else: @@ -571,6 +585,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": # ref: https://huggingface.co/tiiuae/falcon-7b res = "falcon" + if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e": + # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base + res = "falcon3" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 res = "bert-bge" @@ -664,6 +681,12 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65": # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base res = "roberta-bpe" + if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb": + # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct + res = "gigachat" + if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1": + # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct + res = "megrez" if res is None: logger.warning("\n") @@ -686,6 +709,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: return res # Marker: End get_vocab_base_pre + def _set_vocab_none(self) -> None: + self.gguf_writer.add_tokenizer_model("none") + def _set_vocab_gpt2(self) -> None: tokens, toktypes, tokpre = self.get_vocab_base() self.gguf_writer.add_tokenizer_model("gpt2") @@ -1669,6 +1695,184 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("DeciLMForCausalLM") +class DeciModel(Model): + model_arch = gguf.MODEL_ARCH.DECI + + @staticmethod + def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int: + # DeciLM-specific code + intermediate_size = int(2 * ffn_mult * n_embd / 3) + return DeciModel._find_multiple(intermediate_size, 256) + + @staticmethod + def _find_multiple(n: int, k: int) -> int: + # DeciLM-specific code + if n % k == 0: + return n + return n + k - (n % k) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + _block_configs: list[dict[str,Any]] = self.hparams["block_configs"] + assert self.block_count == len(_block_configs) + self._num_kv_heads = list() + self._num_heads = list() + _ffn_multipliers = list() + # ***linear attention layer*** + # if n_heads_in_group is None and replace_with_linear is True + # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads + # ***attention-free layer*** + # if n_heads_in_group is None and replace_with_linear is False + # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 + # ***normal attention-layer*** + # if n_heads_in_group is not None, then + # _num_kv_heads[il] is num_attention_head // n_heads_in_group and + # _num_heads[il] is num_attention_head + for il in range(len(_block_configs)): + if _block_configs[il]["attention"]["n_heads_in_group"] is None: + if _block_configs[il]["attention"]["replace_with_linear"] is True: + self._num_kv_heads.append(0) + self._num_heads.append(self.hparams["num_attention_heads"]) + else: + self._num_kv_heads.append(0) + self._num_heads.append(0) + else: + self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"]) + self._num_heads.append(self.hparams["num_attention_heads"]) + _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"]) + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_heads) + assert self.block_count == len(_ffn_multipliers) + assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) + assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int) + assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float) + self._ffn_dims: list[int] = [ + DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"]) + for multiplier in _ffn_multipliers + ] + + def set_vocab(self): + # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's + # eos_token from '|eot_id|' to '|end_of_text|' + if self.hparams.get("vocab_size", 128256) == 128256: + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab( + self.dir_model, load_merges=True, + special_token_types = ['bos', 'eos', 'eom', 'eot'] + ) + special_vocab._set_special_token("bos", 128000) + special_vocab._set_special_token("eos", 128001) + special_vocab._set_special_token("eom", 128008) + special_vocab._set_special_token("eot", 128009) + special_vocab.add_to_gguf(self.gguf_writer) + else: + # DeciLM-7B + self._set_vocab_llama_hf() +# self._set_vocab_gpt2() + + def set_gguf_parameters(self): + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_heads) + assert self.block_count == len(self._ffn_dims) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + self.gguf_writer.add_head_count(self._num_heads) + self.gguf_writer.add_feed_forward_length(self._ffn_dims) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_file_type(self.ftype) + else: # DeciLM-7B + super().set_gguf_parameters() + if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B + self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"] + assert self.block_count == len(self._num_kv_heads) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + if bid is not None: + if "num_key_value_heads_per_layer" in self.hparams: + n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid] + elif "block_configs" in self.hparams: + n_kv_head = self._num_kv_heads[bid] + n_head = self._num_heads[bid] + else: + n_kv_head = self.hparams.get("num_key_value_heads") + else: + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = DeciModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = DeciModel.permute(data_torch, n_head, n_kv_head) + return [(self.map_tensor_name(name), data_torch)] + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): + if rope_scaling.get("rope_type", '').lower() == "llama3": + base = self.hparams.get("rope_theta", 10000.0) + dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = rope_scaling.get("factor", 8.0) + low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) + high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen != high_freq_wavelen + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + + def prepare_tensors(self): + super().prepare_tensors() + + @Model.register("BitnetForCausalLM") class BitnetModel(Model): model_arch = gguf.MODEL_ARCH.BITNET @@ -2001,6 +2205,67 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) +@Model.register("Qwen2VLForConditionalGeneration") +class Qwen2VLModel(Model): + model_arch = gguf.MODEL_ARCH.QWEN2VL + + def set_gguf_parameters(self): + super().set_gguf_parameters() + mrope_section = self.hparams["rope_scaling"]["mrope_section"] + mrope_section += [0] * max(0, 4 - len(mrope_section)) + self.gguf_writer.add_rope_dimension_sections(mrope_section) + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for name, data in super().get_tensors(): + if name.startswith("visual."): + continue + yield name, data + + +@Model.register("WavTokenizerDec") +class WavTokenizerDecModel(Model): + model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if \ + name.endswith("codebook.cluster_size") or \ + name.endswith("codebook.embed_avg") or \ + name.endswith("codebook.inited"): + logger.debug(f"Skipping {name!r}") + return [] + + logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}") + + return [(self.map_tensor_name(name), data_torch)] + + def set_vocab(self): + self._set_vocab_none() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size (self.hparams["vocab_size"]) + self.gguf_writer.add_features_length (self.hparams["n_embd_features"]) + self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"]) + self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"]) + self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"]) + + self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"]) + self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"]) + + self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"]) + self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"]) + + self.gguf_writer.add_causal_attention(False) + + @Model.register("Qwen2MoeForCausalLM") class Qwen2MoeModel(Model): model_arch = gguf.MODEL_ARCH.QWEN2MOE @@ -2129,6 +2394,15 @@ class Phi3MiniModel(Model): model_arch = gguf.MODEL_ARCH.PHI3 def set_vocab(self): + # Phi-4 model uses GPT2Tokenizer + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + tokenizer_class = tokenizer_config_json['tokenizer_class'] + if tokenizer_class == 'GPT2Tokenizer': + return self._set_vocab_gpt2() + from sentencepiece import SentencePieceProcessor tokenizer_path = self.dir_model / 'tokenizer.model' @@ -2245,7 +2519,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_dimension_count(rope_dims) self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"])) + sliding_window = self.hparams.get("sliding_window") + # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models + if sliding_window is None: + sliding_window = 0 + self.gguf_writer.add_sliding_window(sliding_window) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: n_embd = self.find_hparam(["hidden_size", "n_embd"]) @@ -2544,7 +2822,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("BertModel", "CamembertModel", "RobertaModel") +@Model.register("BertModel", "BertForMaskedLM", "CamembertModel") class BertModel(Model): model_arch = gguf.MODEL_ARCH.BERT @@ -2610,13 +2888,73 @@ def phantom(tok): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused + if name.startswith("bert."): + name = name[5:] + + if name.endswith(".gamma"): + name = name[:-6] + ".weight" + + if name.endswith(".beta"): + name = name[:-5] + ".bias" + # we are only using BERT for embeddings so we don't need the pooling layer if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): return [] # we don't need these + if name.startswith("cls.predictions"): + return [] + + if name.startswith("cls.seq_relationship"): + return [] + return [(self.map_tensor_name(name), data_torch)] +@Model.register("RobertaModel") +class RobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # we need the pad_token_id to know how to chop down position_embd matrix + if (pad_token_id := self.hparams.get("pad_token_id")) is not None: + self._position_offset = 1 + pad_token_id + if "max_position_embeddings" in self.hparams: + self.hparams["max_position_embeddings"] -= self._position_offset + else: + self._position_offset = None + + def set_vocab(self): + """Support BPE tokenizers for roberta models""" + bpe_tok_path = self.dir_model / "tokenizer.json" + if bpe_tok_path.exists(): + self._set_vocab_gpt2() + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + + else: + return super().set_vocab() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # if name starts with "roberta.", remove the prefix + # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): + name = name[8:] + + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor + if name == "embeddings.position_embeddings.weight": + if self._position_offset is not None: + data_torch = data_torch[self._position_offset:,:] + + return super().modify_tensors(data_torch, name, bid) + + @Model.register("NomicBertModel") class NomicBertModel(BertModel): model_arch = gguf.MODEL_ARCH.NOMIC_BERT @@ -2936,6 +3274,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if new_name.endswith("time_mix_w2.weight"): data_torch = data_torch.permute(0, 2, 1) + if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name: + data_torch = data_torch.squeeze() + rescale_every_n_layers = self.hparams["rescale_every"] if rescale_every_n_layers > 0: if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"): @@ -3404,6 +3745,97 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("DeepseekForCausalLM") +class DeepseekModel(Model): + model_arch = gguf.MODEL_ARCH.DEEPSEEK + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + + self.gguf_writer.add_rope_dimension_count(rope_dim) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_weights_scale(1.0) + self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) + self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) + + _experts: list[dict[str, Tensor]] | None = None + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = DeepseekModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("mlp.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + @Model.register("DeepseekV2ForCausalLM") class DeepseekV2Model(Model): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index aa10e5db796..fea23ddb4ae 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -72,6 +72,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", }, {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, + {"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", }, {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", }, {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, @@ -104,6 +105,8 @@ class TOKENIZER_TYPE(IntEnum): {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", }, {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", }, {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"}, + {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"}, + {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"}, ] diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9210e9fea5e..66cfab2c3b7 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -20,7 +20,12 @@ else() add_subdirectory(batched) add_subdirectory(embedding) add_subdirectory(eval-callback) - add_subdirectory(gbnf-validator) + + if (NOT WIN32) + # disabled on Windows because it uses internal functions not exported with LLAMA_API + add_subdirectory(gbnf-validator) + endif() + add_subdirectory(gguf-hash) add_subdirectory(gguf-split) add_subdirectory(gguf) @@ -46,12 +51,17 @@ else() add_subdirectory(speculative) add_subdirectory(speculative-simple) add_subdirectory(tokenize) + add_subdirectory(tts) + add_subdirectory(gen-docs) if (NOT GGML_BACKEND_DL) # these examples use the backends directly and cannot be built with dynamic loading add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(cvector-generator) add_subdirectory(export-lora) - add_subdirectory(quantize-stats) + if (NOT WIN32) + # disabled on Windows because it uses internal functions not exported with LLAMA_API + add_subdirectory(quantize-stats) + endif() add_subdirectory(llava) if (GGML_RPC) add_subdirectory(rpc) diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index ba219cd4b32..e2e01f2d598 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -65,6 +65,7 @@ int main(int argc, char ** argv) { llama_context * ctx = llama_new_context_with_model(model, ctx_params); auto sparams = llama_sampler_chain_default_params(); + sparams.no_perf = false; llama_sampler * smpl = llama_sampler_chain_init(sparams); diff --git a/examples/cvector-generator/mean.hpp b/examples/cvector-generator/mean.hpp index 16be5ce3eec..4eeac1eeb7a 100644 --- a/examples/cvector-generator/mean.hpp +++ b/examples/cvector-generator/mean.hpp @@ -15,7 +15,7 @@ static void run( for (size_t il = 0; il < v_input.size(); ++il) { // prepare output vector struct ggml_tensor * ctrl_out = v_output[il]; - ggml_format_name(ctrl_out, "direction.%ld", il+1); + ggml_format_name(ctrl_out, "direction.%zu", il+1); // calculate mean vector struct ggml_tensor * t_layer = v_input[il]; diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index f6e307fbc49..e88bbdde93f 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -302,7 +302,7 @@ static void run_pca( // prepare output vector struct ggml_tensor * ctrl_out = v_output[il]; - ggml_format_name(ctrl_out, "direction.%ld", il+1); + ggml_format_name(ctrl_out, "direction.%zu", il+1); // run power_iteration params.i_layer = il; diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 67662313d07..058b5cc8602 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -265,8 +265,8 @@ struct lora_merge_ctx { fout.write((const char *)data.data(), data.size()); } - printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged); - printf("%s : wrote %ld tensors to output file\n", __func__, trans.size()); + printf("%s : merged %zu tensors with lora adapters\n", __func__, n_merged); + printf("%s : wrote %zu tensors to output file\n", __func__, trans.size()); } void copy_tensor(struct ggml_tensor * base) { @@ -352,7 +352,7 @@ struct lora_merge_ctx { const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale; delta = ggml_scale(ctx0, delta, scale); cur = ggml_add(ctx0, delta, cur); - printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type)); + printf("%s : + merging from adapter[%zu] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type)); printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]); } cur = ggml_cast(ctx0, cur, out->type); diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp index 7493af9d3ae..17a0e27c444 100644 --- a/examples/gbnf-validator/gbnf-validator.cpp +++ b/examples/gbnf-validator/gbnf-validator.cpp @@ -11,19 +11,15 @@ static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) { const auto cpts = unicode_cpts_from_utf8(input_str); - const llama_grammar_rules & rules = llama_grammar_get_rules (grammar); - llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar); + auto & stacks_cur = llama_grammar_get_stacks(grammar); size_t pos = 0; for (const auto & cpt : cpts) { - const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy - - llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur); + llama_grammar_accept(grammar, cpt); if (stacks_cur.empty()) { error_pos = pos; error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'"; - stacks_cur = stacks_prev; return false; } ++pos; @@ -82,7 +78,8 @@ int main(int argc, char** argv) { llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root"); if (grammar == nullptr) { - throw std::runtime_error("Failed to initialize llama_grammar"); + fprintf(stdout, "Failed to initialize llama_grammar\n"); + return 1; } // Read the input file std::string input_str; diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 7e62657e118..75f63f93891 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -287,7 +287,7 @@ struct split_strategy { } void print_info() { - printf("n_split: %ld\n", ctx_outs.size()); + printf("n_split: %zu\n", ctx_outs.size()); int i_split = 0; for (auto & ctx_out : ctx_outs) { // re-calculate the real gguf size for each split (= metadata size + total size of all tensors) @@ -297,7 +297,7 @@ struct split_strategy { total_size += ggml_nbytes(t); } total_size = total_size / 1000 / 1000; // convert to megabytes - printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); + printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); i_split++; } } diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 6e42fa0734e..18a945b3390 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -75,7 +75,7 @@ static std::vector> encode(llama_context * ctx, const std::ve } std::vector emb_norm(emb_unorm.size()); - common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd); + common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd, 2); result.push_back(emb_norm); #ifdef GRIT_DEBUG diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index bac606f4716..2338ad1067d 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1521,7 +1521,7 @@ int main(int argc, char ** argv) { for (const auto & inst : params_instances) { params_idx++; if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count); + fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count); } // keep the same model between tests when possible if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { @@ -1573,14 +1573,14 @@ int main(int argc, char ** argv) { // warmup run if (t.n_prompt > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count); + fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count); } //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } if (t.n_gen > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count); + fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count); } test_gen(ctx, 1, t.n_threads); } @@ -1592,14 +1592,14 @@ int main(int argc, char ** argv) { if (t.n_prompt > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, + fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps); } test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } if (t.n_gen > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, + fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps); } test_gen(ctx, t.n_gen, t.n_threads); diff --git a/examples/llama.android/llama/build.gradle.kts b/examples/llama.android/llama/build.gradle.kts index 2d1dfba2040..28dbc190488 100644 --- a/examples/llama.android/llama/build.gradle.kts +++ b/examples/llama.android/llama/build.gradle.kts @@ -19,6 +19,7 @@ android { externalNativeBuild { cmake { arguments += "-DLLAMA_BUILD_COMMON=ON" + arguments += "-DGGML_LLAMAFILE=OFF" arguments += "-DCMAKE_BUILD_TYPE=Release" cppFlags += listOf() arguments += listOf() diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index 5d32f377f26..3ce0d60c80d 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -43,3 +43,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +set(TARGET llama-qwen2vl-cli) +add_executable(${TARGET} qwen2vl-cli.cpp) +set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index d7c94352b56..3cd0d2fa859 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -8,25 +8,25 @@ #include "ggml-alloc.h" #include "ggml-backend.h" -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef GGML_USE_SYCL -#include "ggml-sycl.h" -#endif - -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef GGML_USE_CANN -#include "ggml-cann.h" -#endif - -#ifdef GGML_USE_VULKAN -#include "ggml-vulkan.h" -#endif +//#ifdef GGML_USE_CUDA +//#include "ggml-cuda.h" +//#endif +// +//#ifdef GGML_USE_SYCL +//#include "ggml-sycl.h" +//#endif +// +//#ifdef GGML_USE_METAL +//#include "ggml-metal.h" +//#endif +// +//#ifdef GGML_USE_CANN +//#include "ggml-cann.h" +//#endif +// +//#ifdef GGML_USE_VULKAN +//#include "ggml-vulkan.h" +//#endif #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" @@ -102,7 +102,9 @@ static std::string format(const char * fmt, ...) { #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector" #define KEY_MINICPMV_VERSION "clip.minicpmv_version" +#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger" #define KEY_USE_GELU "clip.use_gelu" +#define KEY_USE_SILU "clip.use_silu" #define KEY_N_EMBD "clip.%s.embedding_length" #define KEY_N_FF "clip.%s.feed_forward_length" #define KEY_N_BLOCK "clip.%s.block_count" @@ -129,7 +131,8 @@ static std::string format(const char * fmt, ...) { #define TN_TOKEN_EMBD "%s.token_embd.weight" #define TN_POS_EMBD "%s.position_embd.weight" #define TN_CLASS_EMBD "v.class_embd" -#define TN_PATCH_EMBD "v.patch_embd.weight" +#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat +#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1" #define TN_PATCH_BIAS "v.patch_embd.bias" #define TN_ATTN_K "%s.blk.%d.attn_k.%s" #define TN_ATTN_Q "%s.blk.%d.attn_q.%s" @@ -163,6 +166,7 @@ enum projector_type { PROJECTOR_TYPE_LDP, PROJECTOR_TYPE_LDPV2, PROJECTOR_TYPE_RESAMPLER, + PROJECTOR_TYPE_MERGER, PROJECTOR_TYPE_UNKNOWN, }; @@ -171,6 +175,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_LDP, "ldp" }, { PROJECTOR_TYPE_LDPV2, "ldpv2"}, { PROJECTOR_TYPE_RESAMPLER, "resampler"}, + { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"}, }; @@ -463,7 +468,8 @@ struct clip_vision_model { // embeddings struct ggml_tensor * class_embedding; - struct ggml_tensor * patch_embeddings; + struct ggml_tensor * patch_embeddings_0; + struct ggml_tensor * patch_embeddings_1; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) struct ggml_tensor * patch_bias; struct ggml_tensor * position_embeddings; @@ -553,6 +559,7 @@ struct clip_ctx { bool has_vision_encoder = false; bool has_llava_projector = false; bool has_minicpmv_projector = false; + bool has_qwen2vl_merger = false; int minicpmv_version = 2; struct clip_vision_model vision_model; @@ -561,6 +568,7 @@ struct clip_ctx { float image_mean[3]; float image_std[3]; bool use_gelu = false; + bool use_silu = false; int32_t ftype = 1; bool has_class_embedding = true; @@ -606,14 +614,26 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 image_size_height = imgs->data->ny; } } + else if (ctx->has_qwen2vl_merger) { + // use the image's native resolution when image is avaible + if (is_inf) { + // if (imgs->data->nx && imgs->data->ny) { + image_size_width = imgs->data->nx; + image_size_height = imgs->data->ny; + } + } const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int patches_w = image_size_width / patch_size; + const int patches_h = image_size_height / patch_size; const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); + const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions; const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; const int d_head = hidden_size / n_head; int n_layer = hparams.n_layer; const float eps = hparams.eps; + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; const int batch_size = imgs->size; @@ -634,10 +654,30 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ggml_set_name(inp_raw, "inp_raw"); ggml_set_input(inp_raw); - struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + if (ctx->has_qwen2vl_merger) { + GGML_ASSERT(image_size_width % (patch_size * 2) == 0); + GGML_ASSERT(image_size_height % (patch_size * 2) == 0); + + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_reshape_4d( + ctx0, inp, + hidden_size * 2, patches_w / 2, patches_h, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2)); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3)); + inp = ggml_reshape_3d( + ctx0, inp, + hidden_size, patches_w * patches_h, batch_size); + } + else { + inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + } if (ctx->has_patch_bias) { // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); @@ -659,12 +699,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } } - struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); ggml_set_name(positions, "positions"); ggml_set_input(positions); - embeddings = - ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); + if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding + embeddings = + ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); + } if (ctx->has_minicpmv_projector) { int pos_w = image_size_width/patch_size; @@ -688,7 +730,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } // loop over layers - if (ctx->has_minicpmv_projector) { + if (ctx->has_minicpmv_projector || ctx->has_qwen2vl_merger) { + // TODO: figure out why we doing thing in this way ??? n_layer += 1; } for (int il = 0; il < n_layer - 1; il++) { @@ -710,8 +753,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); - Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); + if (ctx->has_qwen2vl_merger) { + Q = ggml_rope_multi( + ctx0, Q, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + } + Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); @@ -719,6 +767,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + if (ctx->has_qwen2vl_merger) { + K = ggml_rope_multi( + ctx0, K, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + } K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); @@ -758,6 +811,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 if (ctx->use_gelu) { cur = ggml_gelu_inplace(ctx0, cur); + } else if (ctx->use_silu) { + cur = ggml_silu_inplace(ctx0, cur); } else { cur = ggml_gelu_quick_inplace(ctx0, cur); } @@ -769,6 +824,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 cur = ggml_add(ctx0, embeddings, cur); embeddings = cur; + } // post-layernorm @@ -840,7 +896,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); // stride = 1, padding = 1, bias is nullptr - block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); // layer norm // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] @@ -888,7 +944,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // block_2 { // stride = 2 - block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] // layer norm @@ -949,7 +1005,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // mlp_2 ne [24, 24, 2048, 1] mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); // weight ne = [3, 3, 2048, 1] - struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); + struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); @@ -1030,6 +1086,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 GGML_ASSERT(false); } } + else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { + embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); + + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + + // GELU activation + embeddings = ggml_gelu(ctx0, embeddings); + + // Second linear layer + embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); + } // build the graph ggml_build_forward_expand(gf, embeddings); @@ -1153,30 +1222,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } } -#ifdef GGML_USE_CUDA - new_clip->backend = ggml_backend_cuda_init(0); - LOG_INF("%s: CLIP using CUDA backend\n", __func__); -#endif - -#ifdef GGML_USE_METAL - new_clip->backend = ggml_backend_metal_init(); - LOG_INF("%s: CLIP using Metal backend\n", __func__); -#endif - -#ifdef GGML_USE_CANN - new_clip->backend = ggml_backend_cann_init(0); - LOG_INF("%s: CLIP using CANN backend\n", __func__); -#endif - -#ifdef GGML_USE_VULKAN - new_clip->backend = ggml_backend_vk_init(0); - LOG_INF("%s: CLIP using Vulkan backend\n", __func__); -#endif - -#ifdef GGML_USE_SYCL - new_clip->backend = ggml_backend_sycl_init(0); - LOG_INF("%s: CLIP using SYCL backend\n", __func__); -#endif +//#ifdef GGML_USE_CUDA +// new_clip->backend = ggml_backend_cuda_init(0); +// LOG_INF("%s: CLIP using CUDA backend\n", __func__); +//#endif +// +//#ifdef GGML_USE_METAL +// new_clip->backend = ggml_backend_metal_init(); +// LOG_INF("%s: CLIP using Metal backend\n", __func__); +//#endif +// +//#ifdef GGML_USE_CANN +// new_clip->backend = ggml_backend_cann_init(0); +// LOG_INF("%s: CLIP using CANN backend\n", __func__); +//#endif +// +//#ifdef GGML_USE_VULKAN +// new_clip->backend = ggml_backend_vk_init(0); +// LOG_INF("%s: CLIP using Vulkan backend\n", __func__); +//#endif +// +//#ifdef GGML_USE_SYCL +// new_clip->backend = ggml_backend_sycl_init(0); +// LOG_INF("%s: CLIP using SYCL backend\n", __func__); +//#endif if (!new_clip->backend) { new_clip->backend = ggml_backend_cpu_init(); @@ -1206,6 +1275,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx); } + idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER); + if (idx != -1) { + new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx); + } // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search GGML_ASSERT(new_clip->has_vision_encoder); @@ -1214,6 +1287,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { idx = get_key_idx(ctx, KEY_USE_GELU); new_clip->use_gelu = gguf_get_val_bool(ctx, idx); + try { + idx = get_key_idx(ctx, KEY_USE_SILU); + new_clip->use_silu = gguf_get_val_bool(ctx, idx); + } catch (std::runtime_error & /*e*/) { + new_clip->use_silu = false; + } + if (verbosity >= 1) { LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); @@ -1389,11 +1469,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } try { - vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); + vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); } catch(const std::exception& /*e*/) { LOG_ERR("%s: failed to load vision model tensors\n", __func__); } + try { + vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1); + } catch(const std::exception& /*e*/) { + new_clip->has_qwen2vl_merger = false; + } // LLaVA projection if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) { @@ -1481,6 +1566,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight")); vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias")); } + else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) { + vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); + vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias")); + vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); + vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); + } else { std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); @@ -1519,6 +1610,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend)); clip_image_f32_batch batch; batch.size = 1; + batch.data = nullptr; ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); ggml_gallocr_reserve(new_clip->compute_alloc, gf); size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); @@ -1532,6 +1624,10 @@ void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size ctx_clip->load_image_size = load_image_size; } +struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) { + return ctx_clip->load_image_size; +} + struct clip_image_size * clip_image_size_init() { struct clip_image_size * load_image_size = new struct clip_image_size(); load_image_size->width = 448; @@ -1984,6 +2080,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } return true; } + else if (ctx->has_qwen2vl_merger) { + clip_image_u8 * resized = clip_image_u8_init(); + auto patch_size = clip_patch_size(ctx) * 2; + int nx = ceil((float)img->nx / patch_size) * patch_size; + int ny = ceil((float)img->ny / patch_size) * patch_size; + bicubic_resize(*img, *resized, nx, ny); + + res_imgs->data = new clip_image_f32[1]; + // clip_image_f32 * res = clip_image_f32_init(); + normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std); + // res_imgs->data[0] = *res; + res_imgs->size = 1; + + // clip_image_f32_free(res); + clip_image_u8_free(resized); + return true; + } bool pad_to_square = true; if (!ctx->has_vision_encoder) { @@ -2173,6 +2286,13 @@ size_t clip_embd_nbytes(const struct clip_ctx * ctx) { return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); } +size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) { + clip_image_f32 img; + img.nx = img_w; + img.ny = img_h; + return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); +} + int32_t clip_image_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.image_size; } @@ -2194,6 +2314,13 @@ const int32_t * clip_image_grid(const struct clip_ctx * ctx) { } int clip_n_patches(const struct clip_ctx * ctx) { + clip_image_f32 img; + img.nx = ctx->vision_model.hparams.image_size; + img.ny = ctx->vision_model.hparams.image_size; + return clip_n_patches_by_img(ctx, &img); +} + +int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) { const auto & params = ctx->vision_model.hparams; int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); @@ -2207,6 +2334,11 @@ int clip_n_patches(const struct clip_ctx * ctx) { else if (ctx->minicpmv_version == 3) { n_patches = 64; } + } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { + int patch_size = params.patch_size * 2; + int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); + int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); + n_patches = x_patch * y_patch; } return n_patches; @@ -2335,7 +2467,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const int image_size = hparams.image_size; int image_size_width = image_size; int image_size_height = image_size; - if (ctx->has_minicpmv_projector) { + if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) { image_size_width = imgs->data[0].nx; image_size_height = imgs->data[0].ny; } @@ -2355,7 +2487,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima for (size_t i = 0; i < imgs->size; i++) { const int nx = imgs->data[i].nx; const int ny = imgs->data[i].ny; - if (!ctx->has_minicpmv_projector) { + if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) { GGML_ASSERT(nx == image_size && ny == image_size); } @@ -2413,9 +2545,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); - for(int i=0;ihas_qwen2vl_merger) { + struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + + const int pw = image_size_width / patch_size; + const int ph = image_size_height / patch_size; + int* positions_data = (int*)malloc(ggml_nbytes(positions)); + + int ptr = 0; + for (int y = 0; y < ph; y+=2) + { + for (int x = 0; x < pw; x+=2) + { + for (int dy = 0; dy < 2; dy++) { + for (int dx = 0; dx < 2; dx++) { + positions_data[ptr] = y + dy; + positions_data[num_patches + ptr] = x + dx; + positions_data[num_patches * 2 + ptr] = y + dy; + positions_data[num_patches * 3 + ptr] = x + dx; + ptr++; + } + } + } + } + + ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); + free(positions_data); + } + else { struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); int* positions_data = (int*)malloc(ggml_nbytes(positions)); @@ -2444,16 +2603,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); free(positions_data); - } - { - struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); - int* patches_data = (int*)malloc(ggml_nbytes(patches)); - for (int i = 0; i < num_patches; i++) { - patches_data[i] = i + 1; + { + struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); + int* patches_data = (int*)malloc(ggml_nbytes(patches)); + for (int i = 0; i < num_patches; i++) { + patches_data[i] = i + 1; + } + ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); + free(patches_data); } - ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); - free(patches_data); } } @@ -2626,6 +2785,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return 3584; } } + if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { + return ctx->vision_model.mm_1_b->ne[0]; + } std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); @@ -2637,3 +2799,21 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) { } return 0; } + +bool clip_is_qwen2vl(const struct clip_ctx * ctx) { + return ctx->has_qwen2vl_merger; +} + + +bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { + clip_image_f32 clip_img; + clip_img.buf.resize(h * w * 3); + for (int i = 0; i < h*w*3; i++) + { + clip_img.buf[i] = img[i]; + } + clip_img.nx = w; + clip_img.ny = h; + clip_image_encode(ctx, n_threads, &clip_img, vec); + return true; +} diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 78588bdf174..1603edd265e 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -45,6 +45,7 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); +CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w); CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); @@ -55,11 +56,13 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); -CLIP_API int clip_n_patches (const struct clip_ctx * ctx); -CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); +CLIP_API int clip_n_patches (const struct clip_ctx * ctx); +CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img); +CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx); CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); +CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip); CLIP_API struct clip_image_size * clip_image_size_init(); CLIP_API struct clip_image_u8 * clip_image_u8_init (); @@ -86,6 +89,9 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx); +CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx); + +CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); #ifdef __cplusplus } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 4ca53a0b883..16f30c56cea 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -259,25 +259,33 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); - if (clip_is_minicpmv(ctx_clip)) { + if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) { std::vector image_embd_v; image_embd_v.resize(img_res_v.size); struct clip_image_size * load_image_size = clip_image_size_init(); + for (size_t i = 0; i < img_res_v.size; i++) { const int64_t t_img_enc_step_start_us = ggml_time_us(); - image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); + image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny)); int patch_size=14; load_image_size->width = img_res_v.data[i].nx; load_image_size->height = img_res_v.data[i].ny; clip_add_load_image_size(ctx_clip, load_image_size); + bool encoded = false; - int has_minicpmv_projector = clip_is_minicpmv(ctx_clip); - if (has_minicpmv_projector == 2) { - encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); - } - else if (has_minicpmv_projector == 3) { + if (clip_is_qwen2vl(ctx_clip)) { encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); } + else { + int has_minicpmv_projector = clip_is_minicpmv(ctx_clip); + if (has_minicpmv_projector == 2) { + encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); + } + else if (has_minicpmv_projector == 3) { + encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); + } + } + if (!encoded) { LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); return false; @@ -290,8 +298,11 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli int n_img_pos_out = 0; for (size_t i = 0; i < image_embd_v.size(); i++) { - std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip)); - n_img_pos_out += clip_n_patches(ctx_clip); + std::memcpy( + image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), + image_embd_v[i], + clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny)); + n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]); } *n_img_pos = n_img_pos_out; for (size_t i = 0; i < image_embd_v.size(); i++) { @@ -387,7 +398,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co if (clip_is_minicpmv(ctx_clip)) { num_max_patches = 10; } - float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model + float * image_embd; + if (clip_is_qwen2vl(ctx_clip)) { + // qwen2vl don't split image into chunks, so `num_max_patches` is not needed. + image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny)); + } else { + image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model + } if (!image_embd) { LOG_ERR("Unable to allocate memory for image embeddings\n"); return false; diff --git a/examples/llava/qwen2_vl_surgery.py b/examples/llava/qwen2_vl_surgery.py new file mode 100644 index 00000000000..c87606b4fdf --- /dev/null +++ b/examples/llava/qwen2_vl_surgery.py @@ -0,0 +1,165 @@ +import argparse +from typing import Dict + +import torch +import numpy as np +from gguf import * +from transformers import ( + Qwen2VLForConditionalGeneration, + Qwen2VLProcessor, + AutoProcessor, + Qwen2VLConfig +) + + +VISION = "clip.vision" + + +def k(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + + +def to_gguf_name(name: str) -> str: + og = name + name = name.replace("text_model", "t").replace("vision_model", "v") + name = name.replace("blocks", "blk").replace("embeddings.", "") + name = name.replace("attn.", "attn_") + name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.") + # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln") + name = name.replace("norm1", "ln1").replace("norm2", "ln2") + name = name.replace("merger.mlp", 'mm') + print(f"[to_gguf_name] {og} --> {name}") + return name + + +def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]: + vision_model = qwen2vl.visual + tensor_map = {} + for name, ten in vision_model.state_dict().items(): + ten = ten.numpy() + if 'qkv' in name: + if ten.ndim == 2: # weight + c3, _ = ten.shape + else: # bias + c3 = ten.shape[0] + assert c3 % 3 == 0 + c = c3 // 3 + wq = ten[:c] + wk = ten[c: c * 2] + wv = ten[c * 2:] + tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq + tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk + tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv + elif 'merger' in name: + if name.endswith("ln_q.weight"): + tensor_map['v.post_ln.weight'] = ten + elif name.endswith("ln_q.bias"): + tensor_map['v.post_ln.bias'] = ten + else: + # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias" + tensor_map[to_gguf_name(name)] = ten + elif 'patch_embed.proj.weight' in name: + # NOTE: split Conv3D into Conv2Ds + c1, c2, kt, kh, kw = ten.shape + assert kt == 2, "Current implmentation only support temporal_patch_size of 2" + tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...] + tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...] + else: + tensor_map[to_gguf_name(f"vision_model.{name}")] = ten + + for new_name, ten in tensor_map.items(): + if ten.ndim <= 1 or new_name.endswith("_norm.weight"): + tensor_map[new_name] = ten.astype(np.float32) + else: + tensor_map[new_name] = ten.astype(dtype) + tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder + return tensor_map + + +def main(args): + if args.data_type == 'fp32': + dtype = torch.float32 + np_dtype = np.float32 + ftype = 0 + elif args.data_type == 'fp16': + dtype = torch.float32 + np_dtype = np.float16 + ftype = 1 + else: + raise ValueError() + + local_model = False + model_path = "" + model_name = args.model_name + print("model_name: ", model_name) + qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained( + model_name, torch_dtype=dtype, device_map="cpu" + ) + cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType] + vcfg = cfg.vision_config + + if os.path.isdir(model_name): + local_model = True + if model_name.endswith(os.sep): + model_name = model_name[:-1] + model_path = model_name + model_name = os.path.basename(model_name) + fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf" + + fout = GGUFWriter(path=fname_out, arch="clip") + fout.add_description("image encoder for Qwen2VL") + + fout.add_file_type(ftype) + fout.add_bool("clip.has_text_encoder", False) + fout.add_bool("clip.has_vision_encoder", True) + fout.add_bool("clip.has_qwen2vl_merger", True) + fout.add_string("clip.projector_type", "qwen2vl_merger") + + print(cfg.vision_config) + if 'silu' in cfg.vision_config.hidden_act.lower(): + fout.add_bool("clip.use_silu", True) + fout.add_bool("clip.use_gelu", False) + elif 'gelu' in cfg.vision_config.hidden_act.lower(): + fout.add_bool("clip.use_silu", False) + fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower()) + else: + raise ValueError() + + tensor_map = find_vision_tensors(qwen2vl, np_dtype) + for name, data in tensor_map.items(): + fout.add_tensor(name, data) + + fout.add_uint32("clip.vision.patch_size", vcfg.patch_size) + fout.add_uint32("clip.vision.image_size", 14 * 40) # some reasonable size that is divable by (14*2) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim) + fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) + fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0) # not sure what this does, put 0 here as a placeholder + fout.add_name(model_name) + """ + HACK: Since vision rope related parameter aren't stored in the `Qwen2VLConfig, + it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`. + """ + + if local_model: + processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path) + else: + processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name) + fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue] + fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue] + + fout.write_header_to_file() + fout.write_kv_data_to_file() + fout.write_tensors_to_file() + fout.close() + print("save model as: ", fname_out) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct") + parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32") + args = parser.parse_args() + main(args) diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp new file mode 100644 index 00000000000..e86a60280ae --- /dev/null +++ b/examples/llava/qwen2vl-cli.cpp @@ -0,0 +1,581 @@ +#include "arg.h" +#include "base64.hpp" +#include "log.h" +#include "common.h" +#include "sampling.h" +#include "clip.h" +#include "llava.h" +#include "llama.h" +#include "ggml.h" + +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif +#ifdef NDEBUG +#include "ggml-alloc.h" +#include "ggml-backend.h" +#endif + +#include +#include +#include +#include +#include +#include +#include + + +static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, + int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) { + int n_embd = llama_n_embd(llama_get_model(ctx_llama)); + const int patch_size = 14 * 2; + const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0); + const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0); + auto img_tokens = image_embed->n_image_pos; + // llama_pos mrope_pos[img_tokens * 4]; + std::vector mrope_pos; + mrope_pos.resize(img_tokens * 4); + + for (int y = 0; y < ph; y++) + { + for (int x = 0; x < pw; x++) + { + int i = y * pw + x; + mrope_pos[i] = *st_pos_id; + mrope_pos[i + img_tokens] = *st_pos_id + y; + mrope_pos[i + img_tokens * 2] = *st_pos_id + x; + mrope_pos[i + img_tokens * 3] = 0; + } + } + *st_pos_id += std::max(pw, ph); + + int processed = 0; + std::vector batch_mrope_pos; + batch_mrope_pos.resize(img_tokens * 4); + + for (int i = 0; i < img_tokens; i += n_batch) { + int n_eval = img_tokens - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + + // llama_pos batch_mrope_pos[n_eval * 4]; + std::fill(batch_mrope_pos.begin(), batch_mrope_pos.end(), 0); + memcpy(batch_mrope_pos.data(), &mrope_pos[processed], n_eval * sizeof(llama_pos)); + memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + processed], n_eval * sizeof(llama_pos)); + memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos)); + memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos)); + + llama_batch batch = { + int32_t(n_eval), // n_tokens + nullptr, // token + (image_embed->embed+i*n_embd), // embed + batch_mrope_pos.data(), // pos + nullptr, // n_seq_id + nullptr, // seq_id + nullptr, // logits + }; + + if (llama_decode(ctx_llama, batch)) { + LOG_ERR("%s : failed to eval\n", __func__); + return false; + } + *n_past += n_eval; + processed += n_eval; + } + return true; +} + + +static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past, int * st_pos_id) { + int N = (int) tokens.size(); + std::vector pos; + for (int i = 0; i < N; i += n_batch) { + int n_eval = (int) tokens.size() - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + auto batch = llama_batch_get_one(&tokens[i], n_eval); + // TODO: add mrope pos ids somewhere else + pos.resize(batch.n_tokens * 4); + std::fill(pos.begin(), pos.end(), 0); + for (int j = 0; j < batch.n_tokens * 3; j ++) { + pos[j] = *st_pos_id + (j % batch.n_tokens); + } + batch.pos = pos.data(); + + if (llama_decode(ctx_llama, batch)) { + LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); + return false; + } + *n_past += n_eval; + *st_pos_id += n_eval; + } + return true; +} + +static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past, int * st_pos_id) { + std::vector tokens; + tokens.push_back(id); + return eval_tokens(ctx_llama, tokens, 1, n_past, st_pos_id); +} + +static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, int * st_pos_id, bool add_bos){ + std::string str2 = str; + std::vector embd_inp = common_tokenize(ctx_llama, str2, add_bos, true); + eval_tokens(ctx_llama, embd_inp, n_batch, n_past, st_pos_id); + return true; +} + +static const char * sample(struct common_sampler * smpl, + struct llama_context * ctx_llama, + int * n_past, int * st_pos_id) { + const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); + common_sampler_accept(smpl, id, true); + static std::string ret; + if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { + ret = ""; + } else { + ret = common_token_to_piece(ctx_llama, id); + } + eval_id(ctx_llama, id, n_past, st_pos_id); + return ret.c_str(); +} + +static const char* IMG_BASE64_TAG_BEGIN = ""; + +static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) { + begin_out = prompt.find(IMG_BASE64_TAG_BEGIN); + end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out); +} + +static bool prompt_contains_image(const std::string& prompt) { + size_t begin, end; + find_image_tag_in_prompt(prompt, begin, end); + return (begin != std::string::npos); +} + +// replaces the base64 image tag in the prompt with `replacement` +static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) { + size_t img_base64_str_start, img_base64_str_end; + find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end); + if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) { + LOG_ERR("%s: invalid base64 image tag. must be %s%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); + return NULL; + } + + auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN); + auto base64_bytes_count = img_base64_str_end - base64_bytes_start; + auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count ); + + auto required_bytes = base64::required_encode_size(base64_str.size()); + auto img_bytes = std::vector(required_bytes); + base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin()); + + auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size()); + if (!embed) { + LOG_ERR("%s: could not load image from base64 string.\n", __func__); + return NULL; + } + + return embed; +} + +static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") { + size_t begin, end; + find_image_tag_in_prompt(prompt, begin, end); + if (begin == std::string::npos || end == std::string::npos) { + return prompt; + } + auto pre = prompt.substr(0, begin); + auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END)); + return pre + replacement + post; +} + +struct llava_context { + struct clip_ctx * ctx_clip = NULL; + struct llama_context * ctx_llama = NULL; + struct llama_model * model = NULL; +}; + +static void print_usage(int, char ** argv) { + LOG("\n example usage:\n"); + LOG("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); +} + +static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) { + + // load and preprocess the image + llava_image_embed * embed = NULL; + auto prompt = params->prompt; + if (prompt_contains_image(prompt)) { + if (!params->image.empty()) { + LOG_INF("using base64 encoded image instead of command line image path\n"); + } + embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt); + if (!embed) { + LOG_ERR("%s: can't load image from prompt\n", __func__); + return NULL; + } + params->prompt = remove_image_from_prompt(prompt); + } else { + embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str()); + if (!embed) { + fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str()); + return NULL; + } + } + + return embed; +} + +static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) { + int n_past = 0; + int cur_pos_id = 0; + + const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict; + + std::string system_prompt, user_prompt; + size_t image_pos = prompt.find("<|vision_start|>"); + if (image_pos != std::string::npos) { + // new templating mode: Provide the full prompt including system message and use as a placeholder for the image + system_prompt = prompt.substr(0, image_pos); + user_prompt = prompt.substr(image_pos + std::string("<|vision_pad|>").length()); + LOG_INF("system_prompt: %s\n", system_prompt.c_str()); + if (params->verbose_prompt) { + auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + } + } + LOG_INF("user_prompt: %s\n", user_prompt.c_str()); + if (params->verbose_prompt) { + auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + } + } + } else { + // llava-1.5 native mode + system_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|>"; + user_prompt = "<|vision_end|>" + prompt + "<|im_end|>\n<|im_start|>assistant\n"; + if (params->verbose_prompt) { + auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + } + } + } + + eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, true); + if (image_embed != nullptr) { + auto image_size = clip_get_load_image_size(ctx_llava->ctx_clip); + qwen2vl_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past, &cur_pos_id, image_size); + } + eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, false); + + // generate the response + + LOG("\n"); + + struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling); + if (!smpl) { + LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); + exit(1); + } + + std::string response = ""; + for (int i = 0; i < max_tgt_len; i++) { + const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past, &cur_pos_id); + response += tmp; + if (strcmp(tmp, "") == 0) break; + if (strstr(tmp, "###")) break; // Yi-VL behavior + LOG("%s", tmp); + if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) + if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 + if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 + + fflush(stdout); + } + + common_sampler_free(smpl); + LOG("\n"); +} + +static struct llama_model * llava_init(common_params * params) { + llama_backend_init(); + llama_numa_init(params->numa); + + llama_model_params model_params = common_model_params_to_llama(*params); + + llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); + if (model == NULL) { + LOG_ERR("%s: unable to load model\n" , __func__); + return NULL; + } + return model; +} + +static struct llava_context * llava_init_context(common_params * params, llama_model * model) { + const char * clip_path = params->mmproj.c_str(); + + auto prompt = params->prompt; + if (prompt.empty()) { + prompt = "describe the image in detail."; + } + + auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); + + + llama_context_params ctx_params = common_context_params_to_llama(*params); + ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings + + llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); + + if (ctx_llama == NULL) { + LOG_ERR("%s: failed to create the llama_context\n" , __func__); + return NULL; + } + + auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); + + ctx_llava->ctx_llama = ctx_llama; + ctx_llava->ctx_clip = ctx_clip; + ctx_llava->model = model; + return ctx_llava; +} + +static void llava_free(struct llava_context * ctx_llava) { + if (ctx_llava->ctx_clip) { + clip_free(ctx_llava->ctx_clip); + ctx_llava->ctx_clip = NULL; + } + + llama_free(ctx_llava->ctx_llama); + llama_free_model(ctx_llava->model); + llama_backend_free(); +} + +#ifndef NDEBUG + +static void debug_test_mrope_2d() { + // 1. Initialize backend + ggml_backend_t backend = NULL; + std::string backend_name = ""; +#ifdef GGML_USE_CUDA + fprintf(stderr, "%s: using CUDA backend\n", __func__); + backend = ggml_backend_cuda_init(0); // init device 0 + backend_name = "cuda"; + if (!backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } +#endif + // if there aren't GPU Backends fallback to CPU backend + if (!backend) { + backend = ggml_backend_cpu_init(); + backend_name = "cpu"; + } + + // Calculate the size needed to allocate + size_t ctx_size = 0; + ctx_size += 2 * ggml_tensor_overhead(); // tensors + // no need to allocate anything else! + + // 2. Allocate `ggml_context` to store tensor data + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors() + }; + struct ggml_context * ctx = ggml_init(params); + + struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 12, 30); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + + struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 30 * 4); + ggml_set_name(pos, "pos"); + ggml_set_input(pos); + + std::vector dummy_q; + dummy_q.resize(128 * 12 * 30); + std::fill(dummy_q.begin(), dummy_q.end(), 0.1); + // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw)); + + std::vector pos_id; + pos_id.resize(30 * 4); + for (int i = 0; i < 30; i ++) { + pos_id[i] = i; + pos_id[i + 30] = i + 10; + pos_id[i + 60] = i + 20; + pos_id[i + 90] = i + 30; + } + int sections[4] = {32, 32, 0, 0}; + + // 4. Allocate a `ggml_backend_buffer` to store all tensors + ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); + + // 5. Copy tensor data from main memory (RAM) to backend buffer + ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw)); + ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos)); + + // 6. Create a `ggml_cgraph` for mul_mat operation + struct ggml_cgraph * gf = NULL; + struct ggml_context * ctx_cgraph = NULL; + + // create a temporally context to build the graph + struct ggml_init_params params0 = { + /*.mem_size =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph() + }; + ctx_cgraph = ggml_init(params0); + gf = ggml_new_graph(ctx_cgraph); + + struct ggml_tensor * result0 = ggml_rope_multi( + ctx_cgraph, inp_raw, pos, nullptr, + 128/2, sections, LLAMA_ROPE_TYPE_VISION, 32768, 1000000, 1, + 0, 1, 32, 1); + + // Add "result" tensor and all of its dependencies to the cgraph + ggml_build_forward_expand(gf, result0); + + // 7. Create a `ggml_gallocr` for cgraph computation + ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + ggml_gallocr_alloc_graph(allocr, gf); + + // 9. Run the computation + int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading + if (ggml_backend_is_cpu(backend)) { + ggml_backend_cpu_set_n_threads(backend, n_threads); + } + ggml_backend_graph_compute(backend, gf); + + // 10. Retrieve results (output tensors) + // in this example, output tensor is always the last tensor in the graph + struct ggml_tensor * result = result0; + // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1]; + float * result_data = (float *)malloc(ggml_nbytes(result)); + // because the tensor data is stored in device buffer, we need to copy it back to RAM + ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result)); + const std::string bin_file = "mrope_2d_" + backend_name +".bin"; + std::ofstream outFile(bin_file, std::ios::binary); + + if (outFile.is_open()) { + outFile.write(reinterpret_cast(result_data), ggml_nbytes(result)); + outFile.close(); + std::cout << "Data successfully written to " + bin_file << std::endl; + } else { + std::cerr << "Error opening file!" << std::endl; + } + + free(result_data); + // 11. Free memory and exit + ggml_free(ctx_cgraph); + ggml_gallocr_free(allocr); + ggml_free(ctx); + ggml_backend_buffer_free(buffer); + ggml_backend_free(backend); +} + +static void debug_dump_img_embed(struct llava_context * ctx_llava) { + int n_embd = llama_n_embd(llama_get_model(ctx_llava->ctx_llama)); + int ne = n_embd * 4; + float vals[56 * 56 * 3]; + // float embd[ne]; + std::vector embd; + embd.resize(ne); + + for (int i = 0; i < 56*56; i++) + { + for (int c = 0; c < 3; c++) + vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56); + } + + clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data()); + + std::ofstream outFile("img_embed.bin", std::ios::binary); + if (outFile.is_open()) { + outFile.write(reinterpret_cast(embd.data()), ne * sizeof(float)); + + outFile.close(); + std::cout << "Data successfully written to mrope.bin" << std::endl; + } else { + std::cerr << "Error opening file!" << std::endl; + } +} + +#endif + + +int main(int argc, char ** argv) { + ggml_time_init(); + + common_params params; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { + return 1; + } + + common_init(); + + if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { + print_usage(argc, argv); + return 1; + } + + auto * model = llava_init(¶ms); + if (model == NULL) { + fprintf(stderr, "%s: error: failed to init llava model\n", __func__); + return 1; + } + + if (prompt_contains_image(params.prompt)) { + auto * ctx_llava = llava_init_context(¶ms, model); + + auto * image_embed = load_image(ctx_llava, ¶ms, ""); + + // process the prompt + process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); + + llama_perf_context_print(ctx_llava->ctx_llama); + llava_image_embed_free(image_embed); + ctx_llava->model = NULL; + llava_free(ctx_llava); +#ifndef NDEBUG + } else if (params.image[0].empty()) { + auto ctx_llava = llava_init_context(¶ms, model); + + debug_test_mrope_2d(); + debug_dump_img_embed(ctx_llava); + + llama_perf_context_print(ctx_llava->ctx_llama); + ctx_llava->model = NULL; + llava_free(ctx_llava); +#endif + } else { + for (auto & image : params.image) { + auto * ctx_llava = llava_init_context(¶ms, model); + + auto * image_embed = load_image(ctx_llava, ¶ms, image); + if (!image_embed) { + LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str()); + return 1; + } + + // process the prompt + process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); + + llama_perf_context_print(ctx_llava->ctx_llama); + llava_image_embed_free(image_embed); + ctx_llava->model = NULL; + llava_free(ctx_llava); + } + } + + llama_free_model(model); + + return 0; +} diff --git a/examples/main/README.md b/examples/main/README.md index 7787f7b11b8..17d80a622a8 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -177,16 +177,11 @@ Example usage: `--temp 0` - `--repeat-penalty N`: Control the repetition of token sequences in the generated text default: 1.0, 1.0 = disabled). - `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size). -- `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty. The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1. The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`). -Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases. - -Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl` - ### DRY Repetition Penalty DRY (Don't Repeat Yourself) sampling is an effective technique for reducing repetition in generated text even across long contexts by penalizing tokens based on their recent usage patterns (original [PR link](https://github.com/oobabooga/text-generation-webui/pull/5677)). diff --git a/examples/quantize/README.md b/examples/quantize/README.md index 5d1e11c67b1..f9cce7b2133 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -81,7 +81,7 @@ Several quantization methods are supported. They differ in the resulting model d - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930) - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957) - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969) - - [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996) + - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996) - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060) - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196) - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361) diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index e78a8596d8c..a5c6fe7e585 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -107,7 +107,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu } float * out = output + batch.seq_id[i][0] * n_embd; - common_embd_normalize(embd, out, n_embd); + common_embd_normalize(embd, out, n_embd, 2); } } @@ -143,7 +143,7 @@ int main(int argc, char ** argv) { std::vector file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator); chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end()); } - LOG_INF("Number of chunks: %ld\n", chunks.size()); + LOG_INF("Number of chunks: %zu\n", chunks.size()); llama_backend_init(); llama_numa_init(params.numa); diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp index 5fe70dac7f1..8b1b23edad1 100644 --- a/examples/rpc/rpc-server.cpp +++ b/examples/rpc/rpc-server.cpp @@ -12,6 +12,10 @@ #include "ggml-vulkan.h" #endif +#ifdef GGML_USE_SYCL +#include "ggml-sycl.h" +#endif + #include "ggml-rpc.h" #ifdef _WIN32 # include @@ -91,6 +95,12 @@ static ggml_backend_t create_backend() { if (!backend) { fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__); } +#elif GGML_USE_SYCL + fprintf(stderr, "%s: using SYCL backend\n", __func__); + backend = ggml_backend_sycl_init(0); // init device 0 + if (!backend) { + fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__); + } #endif // if there aren't GPU Backends fallback to CPU backend @@ -106,6 +116,8 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) { ggml_backend_cuda_get_device_memory(0, free_mem, total_mem); #elif GGML_USE_VULKAN ggml_backend_vk_get_device_memory(0, free_mem, total_mem); +#elif GGML_USE_SYCL + ggml_backend_sycl_get_device_memory(0, free_mem, total_mem); #else #ifdef _WIN32 MEMORYSTATUSEX status; diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt index 52add51ef77..0686d630570 100644 --- a/examples/run/CMakeLists.txt +++ b/examples/run/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-run) add_executable(${TARGET} run.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/run/README.md b/examples/run/README.md index 6e926811f3c..a0680544120 100644 --- a/examples/run/README.md +++ b/examples/run/README.md @@ -3,5 +3,49 @@ The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models. ```bash -./llama-run Meta-Llama-3.1-8B-Instruct.gguf -... +llama-run granite-code +``` + +```bash +llama-run -h +Description: + Runs a llm + +Usage: + llama-run [options] model [prompt] + +Options: + -c, --context-size + Context size (default: 2048) + -n, --ngl + Number of GPU layers (default: 0) + --temp + Temperature (default: 0.8) + -v, --verbose, --log-verbose + Set verbosity level to infinity (i.e. log all messages, useful for debugging) + -h, --help + Show help message + +Commands: + model + Model is a string with an optional prefix of + huggingface:// (hf://), ollama://, https:// or file://. + If no protocol is specified and a file exists in the specified + path, file:// is assumed, otherwise if a file does not exist in + the specified path, ollama:// is assumed. Models that are being + pulled are downloaded with .partial extension while being + downloaded and then renamed as the file without the .partial + extension when complete. + +Examples: + llama-run llama3 + llama-run ollama://granite-code + llama-run ollama://smollm:135m + llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf + llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf + llama-run https://example.com/some-file1.gguf + llama-run some-file2.gguf + llama-run file://some-file3.gguf + llama-run --ngl 999 some-file4.gguf + llama-run --ngl 999 some-file5.gguf Hello World +``` diff --git a/examples/run/run.cpp b/examples/run/run.cpp index cac2faefcc2..f89d041c44a 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -1,192 +1,714 @@ #if defined(_WIN32) -#include +# include #else -#include +# include +# include +# include +#endif + +#if defined(LLAMA_USE_CURL) +# include #endif #include +#include #include #include +#include #include #include #include -#include #include +#include "common.h" +#include "json.hpp" #include "llama-cpp.h" -typedef std::unique_ptr char_array_ptr; +GGML_ATTRIBUTE_FORMAT(1, 2) +static std::string fmt(const char * fmt, ...) { + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + const int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::string buf; + buf.resize(size); + const int size2 = vsnprintf(const_cast(buf.data()), buf.size() + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + + return buf; +} -struct Argument { - std::string flag; - std::string help_text; -}; +GGML_ATTRIBUTE_FORMAT(1, 2) +static int printe(const char * fmt, ...) { + va_list args; + va_start(args, fmt); + const int ret = vfprintf(stderr, fmt, args); + va_end(args); -struct Options { - std::string model_path, prompt_non_interactive; - int ngl = 99; - int n_ctx = 2048; -}; + return ret; +} -class ArgumentParser { - public: - ArgumentParser(const char * program_name) : program_name(program_name) {} +class Opt { + public: + int init(int argc, const char ** argv) { + ctx_params = llama_context_default_params(); + model_params = llama_model_default_params(); + context_size_default = ctx_params.n_batch; + ngl_default = model_params.n_gpu_layers; + common_params_sampling sampling; + temperature_default = sampling.temp; + + if (argc < 2) { + printe("Error: No arguments provided.\n"); + print_help(); + return 1; + } - void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") { - string_args[flag] = &var; - arguments.push_back({flag, help_text}); + // Parse arguments + if (parse(argc, argv)) { + printe("Error: Failed to parse arguments.\n"); + print_help(); + return 1; + } + + // If help is requested, show help and exit + if (help) { + print_help(); + return 2; + } + + ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default; + model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default; + temperature = temperature >= 0 ? temperature : temperature_default; + + return 0; // Success } - void add_argument(const std::string & flag, int & var, const std::string & help_text = "") { - int_args[flag] = &var; - arguments.push_back({flag, help_text}); + llama_context_params ctx_params; + llama_model_params model_params; + std::string model_; + std::string user; + int context_size = -1, ngl = -1; + float temperature = -1; + bool verbose = false; + + private: + int context_size_default = -1, ngl_default = -1; + float temperature_default = -1; + bool help = false; + + bool parse_flag(const char ** argv, int i, const char * short_opt, const char * long_opt) { + return strcmp(argv[i], short_opt) == 0 || strcmp(argv[i], long_opt) == 0; + } + + int handle_option_with_value(int argc, const char ** argv, int & i, int & option_value) { + if (i + 1 >= argc) { + return 1; + } + + option_value = std::atoi(argv[++i]); + + return 0; + } + + int handle_option_with_value(int argc, const char ** argv, int & i, float & option_value) { + if (i + 1 >= argc) { + return 1; + } + + option_value = std::atof(argv[++i]); + + return 0; } int parse(int argc, const char ** argv) { - for (int i = 1; i < argc; ++i) { - std::string arg = argv[i]; - if (string_args.count(arg)) { - if (i + 1 < argc) { - *string_args[arg] = argv[++i]; - } else { - fprintf(stderr, "error: missing value for %s\n", arg.c_str()); - print_usage(); + bool options_parsing = true; + for (int i = 1, positional_args_i = 0; i < argc; ++i) { + if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) { + if (handle_option_with_value(argc, argv, i, context_size) == 1) { + return 1; + } + } else if (options_parsing && (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0)) { + if (handle_option_with_value(argc, argv, i, ngl) == 1) { + return 1; + } + } else if (options_parsing && strcmp(argv[i], "--temp") == 0) { + if (handle_option_with_value(argc, argv, i, temperature) == 1) { return 1; } - } else if (int_args.count(arg)) { - if (i + 1 < argc) { - if (parse_int_arg(argv[++i], *int_args[arg]) != 0) { - fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]); - print_usage(); - return 1; - } - } else { - fprintf(stderr, "error: missing value for %s\n", arg.c_str()); - print_usage(); + } else if (options_parsing && + (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) { + verbose = true; + } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) { + help = true; + return 0; + } else if (options_parsing && strcmp(argv[i], "--") == 0) { + options_parsing = false; + } else if (positional_args_i == 0) { + if (!argv[i][0] || argv[i][0] == '-') { return 1; } + + ++positional_args_i; + model_ = argv[i]; + } else if (positional_args_i == 1) { + ++positional_args_i; + user = argv[i]; } else { - fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str()); - print_usage(); + user += " " + std::string(argv[i]); + } + } + + return 0; + } + + void print_help() const { + printf( + "Description:\n" + " Runs a llm\n" + "\n" + "Usage:\n" + " llama-run [options] model [prompt]\n" + "\n" + "Options:\n" + " -c, --context-size \n" + " Context size (default: %d)\n" + " -n, --ngl \n" + " Number of GPU layers (default: %d)\n" + " --temp \n" + " Temperature (default: %.1f)\n" + " -v, --verbose, --log-verbose\n" + " Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n" + " -h, --help\n" + " Show help message\n" + "\n" + "Commands:\n" + " model\n" + " Model is a string with an optional prefix of \n" + " huggingface:// (hf://), ollama://, https:// or file://.\n" + " If no protocol is specified and a file exists in the specified\n" + " path, file:// is assumed, otherwise if a file does not exist in\n" + " the specified path, ollama:// is assumed. Models that are being\n" + " pulled are downloaded with .partial extension while being\n" + " downloaded and then renamed as the file without the .partial\n" + " extension when complete.\n" + "\n" + "Examples:\n" + " llama-run llama3\n" + " llama-run ollama://granite-code\n" + " llama-run ollama://smollm:135m\n" + " llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf\n" + " llama-run " + "huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n" + " llama-run https://example.com/some-file1.gguf\n" + " llama-run some-file2.gguf\n" + " llama-run file://some-file3.gguf\n" + " llama-run --ngl 999 some-file4.gguf\n" + " llama-run --ngl 999 some-file5.gguf Hello World\n", + context_size_default, ngl_default, temperature_default); + } +}; + +struct progress_data { + size_t file_size = 0; + std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now(); + bool printed = false; +}; + +static int get_terminal_width() { +#if defined(_WIN32) + CONSOLE_SCREEN_BUFFER_INFO csbi; + GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi); + return csbi.srWindow.Right - csbi.srWindow.Left + 1; +#else + struct winsize w; + ioctl(STDOUT_FILENO, TIOCGWINSZ, &w); + return w.ws_col; +#endif +} + +#ifdef LLAMA_USE_CURL +class File { + public: + FILE * file = nullptr; + + FILE * open(const std::string & filename, const char * mode) { + file = fopen(filename.c_str(), mode); + + return file; + } + + int lock() { + if (file) { +# ifdef _WIN32 + fd = _fileno(file); + hFile = (HANDLE) _get_osfhandle(fd); + if (hFile == INVALID_HANDLE_VALUE) { + fd = -1; + + return 1; + } + + OVERLAPPED overlapped = { 0 }; + if (!LockFileEx(hFile, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0, MAXDWORD, MAXDWORD, + &overlapped)) { + fd = -1; + return 1; } +# else + fd = fileno(file); + if (flock(fd, LOCK_EX | LOCK_NB) != 0) { + fd = -1; + + return 1; + } +# endif } - if (string_args["-m"]->empty()) { - fprintf(stderr, "error: -m is required\n"); - print_usage(); + return 0; + } + + ~File() { + if (fd >= 0) { +# ifdef _WIN32 + if (hFile != INVALID_HANDLE_VALUE) { + OVERLAPPED overlapped = { 0 }; + UnlockFileEx(hFile, 0, MAXDWORD, MAXDWORD, &overlapped); + } +# else + flock(fd, LOCK_UN); +# endif + } + + if (file) { + fclose(file); + } + } + + private: + int fd = -1; +# ifdef _WIN32 + HANDLE hFile; +# endif +}; + +class HttpClient { + public: + int init(const std::string & url, const std::vector & headers, const std::string & output_file, + const bool progress, std::string * response_str = nullptr) { + std::string output_file_partial; + curl = curl_easy_init(); + if (!curl) { return 1; } + progress_data data; + File out; + if (!output_file.empty()) { + output_file_partial = output_file + ".partial"; + if (!out.open(output_file_partial, "ab")) { + printe("Failed to open file\n"); + + return 1; + } + + if (out.lock()) { + printe("Failed to exclusively lock file\n"); + + return 1; + } + } + + set_write_options(response_str, out); + data.file_size = set_resume_point(output_file_partial); + set_progress_options(progress, data); + set_headers(headers); + perform(url); + if (!output_file.empty()) { + std::filesystem::rename(output_file_partial, output_file); + } + return 0; } - private: - const char * program_name; - std::unordered_map string_args; - std::unordered_map int_args; - std::vector arguments; + ~HttpClient() { + if (chunk) { + curl_slist_free_all(chunk); + } + + if (curl) { + curl_easy_cleanup(curl); + } + } + + private: + CURL * curl = nullptr; + struct curl_slist * chunk = nullptr; + + void set_write_options(std::string * response_str, const File & out) { + if (response_str) { + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, capture_data); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, response_str); + } else { + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, out.file); + } + } + + size_t set_resume_point(const std::string & output_file) { + size_t file_size = 0; + if (std::filesystem::exists(output_file)) { + file_size = std::filesystem::file_size(output_file); + curl_easy_setopt(curl, CURLOPT_RESUME_FROM_LARGE, static_cast(file_size)); + } + + return file_size; + } + + void set_progress_options(bool progress, progress_data & data) { + if (progress) { + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &data); + curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, update_progress); + } + } + + void set_headers(const std::vector & headers) { + if (!headers.empty()) { + if (chunk) { + curl_slist_free_all(chunk); + chunk = 0; + } + + for (const auto & header : headers) { + chunk = curl_slist_append(chunk, header.c_str()); + } + + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk); + } + } + + void perform(const std::string & url) { + CURLcode res; + curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https"); + curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L); + res = curl_easy_perform(curl); + if (res != CURLE_OK) { + printe("curl_easy_perform() failed: %s\n", curl_easy_strerror(res)); + } + } + + static std::string human_readable_time(double seconds) { + int hrs = static_cast(seconds) / 3600; + int mins = (static_cast(seconds) % 3600) / 60; + int secs = static_cast(seconds) % 60; + + if (hrs > 0) { + return fmt("%dh %02dm %02ds", hrs, mins, secs); + } else if (mins > 0) { + return fmt("%dm %02ds", mins, secs); + } else { + return fmt("%ds", secs); + } + } + + static std::string human_readable_size(curl_off_t size) { + static const char * suffix[] = { "B", "KB", "MB", "GB", "TB" }; + char length = sizeof(suffix) / sizeof(suffix[0]); + int i = 0; + double dbl_size = size; + if (size > 1024) { + for (i = 0; (size / 1024) > 0 && i < length - 1; i++, size /= 1024) { + dbl_size = size / 1024.0; + } + } + + return fmt("%.2f %s", dbl_size, suffix[i]); + } - int parse_int_arg(const char * arg, int & value) { - char * end; - const long val = std::strtol(arg, &end, 10); - if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) { - value = static_cast(val); + static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t, + curl_off_t) { + progress_data * data = static_cast(ptr); + if (total_to_download <= 0) { return 0; } - return 1; + + total_to_download += data->file_size; + const curl_off_t now_downloaded_plus_file_size = now_downloaded + data->file_size; + const curl_off_t percentage = calculate_percentage(now_downloaded_plus_file_size, total_to_download); + std::string progress_prefix = generate_progress_prefix(percentage); + + const double speed = calculate_speed(now_downloaded, data->start_time); + const double tim = (total_to_download - now_downloaded) / speed; + std::string progress_suffix = + generate_progress_suffix(now_downloaded_plus_file_size, total_to_download, speed, tim); + + int progress_bar_width = calculate_progress_bar_width(progress_prefix, progress_suffix); + std::string progress_bar; + generate_progress_bar(progress_bar_width, percentage, progress_bar); + + print_progress(progress_prefix, progress_bar, progress_suffix); + data->printed = true; + + return 0; } - void print_usage() const { - printf("\nUsage:\n"); - printf(" %s [OPTIONS]\n\n", program_name); - printf("Options:\n"); - for (const auto & arg : arguments) { - printf(" %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str()); + static curl_off_t calculate_percentage(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download) { + return (now_downloaded_plus_file_size * 100) / total_to_download; + } + + static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", percentage); } + + static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) { + const auto now = std::chrono::steady_clock::now(); + const std::chrono::duration elapsed_seconds = now - start_time; + return now_downloaded / elapsed_seconds.count(); + } + + static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download, + double speed, double estimated_time) { + const int width = 10; + return fmt("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), width, + human_readable_size(total_to_download).c_str(), width, human_readable_size(speed).c_str(), width, + human_readable_time(estimated_time).c_str()); + } + + static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) { + int progress_bar_width = get_terminal_width() - progress_prefix.size() - progress_suffix.size() - 3; + if (progress_bar_width < 1) { + progress_bar_width = 1; } - printf("\n"); + return progress_bar_width; + } + + static std::string generate_progress_bar(int progress_bar_width, curl_off_t percentage, + std::string & progress_bar) { + const curl_off_t pos = (percentage * progress_bar_width) / 100; + for (int i = 0; i < progress_bar_width; ++i) { + progress_bar.append((i < pos) ? "█" : " "); + } + + return progress_bar; + } + + static void print_progress(const std::string & progress_prefix, const std::string & progress_bar, + const std::string & progress_suffix) { + printe("\r%*s\r%s%s| %s", get_terminal_width(), " ", progress_prefix.c_str(), progress_bar.c_str(), + progress_suffix.c_str()); + } + // Function to write data to a file + static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) { + FILE * out = static_cast(stream); + return fwrite(ptr, size, nmemb, out); + } + + // Function to capture data into a string + static size_t capture_data(void * ptr, size_t size, size_t nmemb, void * stream) { + std::string * str = static_cast(stream); + str->append(static_cast(ptr), size * nmemb); + return size * nmemb; } }; +#endif class LlamaData { - public: - llama_model_ptr model; - llama_sampler_ptr sampler; - llama_context_ptr context; + public: + llama_model_ptr model; + llama_sampler_ptr sampler; + llama_context_ptr context; std::vector messages; + std::vector msg_strs; + std::vector fmtted; - int init(const Options & opt) { - model = initialize_model(opt.model_path, opt.ngl); + int init(Opt & opt) { + model = initialize_model(opt); if (!model) { return 1; } - context = initialize_context(model, opt.n_ctx); + context = initialize_context(model, opt); if (!context) { return 1; } - sampler = initialize_sampler(); + sampler = initialize_sampler(opt); return 0; } - private: - // Initializes the model and returns a unique pointer to it - llama_model_ptr initialize_model(const std::string & model_path, const int ngl) { - llama_model_params model_params = llama_model_default_params(); - model_params.n_gpu_layers = ngl; + private: +#ifdef LLAMA_USE_CURL + int download(const std::string & url, const std::vector & headers, const std::string & output_file, + const bool progress, std::string * response_str = nullptr) { + HttpClient http; + if (http.init(url, headers, output_file, progress, response_str)) { + return 1; + } + + return 0; + } +#else + int download(const std::string &, const std::vector &, const std::string &, const bool, + std::string * = nullptr) { + printe("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); + return 1; + } +#endif + + int huggingface_dl(const std::string & model, const std::vector headers, const std::string & bn) { + // Find the second occurrence of '/' after protocol string + size_t pos = model.find('/'); + pos = model.find('/', pos + 1); + if (pos == std::string::npos) { + return 1; + } - llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params)); + const std::string hfr = model.substr(0, pos); + const std::string hff = model.substr(pos + 1); + const std::string url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff; + return download(url, headers, bn, true); + } + + int ollama_dl(std::string & model, const std::vector headers, const std::string & bn) { + if (model.find('/') == std::string::npos) { + model = "library/" + model; + } + + std::string model_tag = "latest"; + size_t colon_pos = model.find(':'); + if (colon_pos != std::string::npos) { + model_tag = model.substr(colon_pos + 1); + model = model.substr(0, colon_pos); + } + + std::string manifest_url = "https://registry.ollama.ai/v2/" + model + "/manifests/" + model_tag; + std::string manifest_str; + const int ret = download(manifest_url, headers, "", false, &manifest_str); + if (ret) { + return ret; + } + + nlohmann::json manifest = nlohmann::json::parse(manifest_str); + std::string layer; + for (const auto & l : manifest["layers"]) { + if (l["mediaType"] == "application/vnd.ollama.image.model") { + layer = l["digest"]; + break; + } + } + + std::string blob_url = "https://registry.ollama.ai/v2/" + model + "/blobs/" + layer; + return download(blob_url, headers, bn, true); + } + + std::string basename(const std::string & path) { + const size_t pos = path.find_last_of("/\\"); + if (pos == std::string::npos) { + return path; + } + + return path.substr(pos + 1); + } + + int remove_proto(std::string & model_) { + const std::string::size_type pos = model_.find("://"); + if (pos == std::string::npos) { + return 1; + } + + model_ = model_.substr(pos + 3); // Skip past "://" + return 0; + } + + int resolve_model(std::string & model_) { + int ret = 0; + if (string_starts_with(model_, "file://") || std::filesystem::exists(model_)) { + remove_proto(model_); + + return ret; + } + + const std::string bn = basename(model_); + const std::vector headers = { "--header", + "Accept: application/vnd.docker.distribution.manifest.v2+json" }; + if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) { + remove_proto(model_); + ret = huggingface_dl(model_, headers, bn); + } else if (string_starts_with(model_, "ollama://")) { + remove_proto(model_); + ret = ollama_dl(model_, headers, bn); + } else if (string_starts_with(model_, "https://")) { + download(model_, headers, bn, true); + } else { + ret = ollama_dl(model_, headers, bn); + } + + model_ = bn; + + return ret; + } + + // Initializes the model and returns a unique pointer to it + llama_model_ptr initialize_model(Opt & opt) { + ggml_backend_load_all(); + resolve_model(opt.model_); + printe( + "\r%*s" + "\rLoading model", + get_terminal_width(), " "); + llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), opt.model_params)); if (!model) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); + printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str()); } + printe("\r%*s\r", static_cast(sizeof("Loading model")), " "); return model; } // Initializes the context with the specified parameters - llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) { - llama_context_params ctx_params = llama_context_default_params(); - ctx_params.n_ctx = n_ctx; - ctx_params.n_batch = n_ctx; - - llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params)); + llama_context_ptr initialize_context(const llama_model_ptr & model, const Opt & opt) { + llama_context_ptr context(llama_new_context_with_model(model.get(), opt.ctx_params)); if (!context) { - fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__); + printe("%s: error: failed to create the llama_context\n", __func__); } return context; } // Initializes and configures the sampler - llama_sampler_ptr initialize_sampler() { + llama_sampler_ptr initialize_sampler(const Opt & opt) { llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params())); llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1)); - llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f)); + llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(opt.temperature)); llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); return sampler; } }; -// Add a message to `messages` and store its content in `owned_content` -static void add_message(const char * role, const std::string & text, LlamaData & llama_data, - std::vector & owned_content) { - char_array_ptr content(new char[text.size() + 1]); - std::strcpy(content.get(), text.c_str()); - llama_data.messages.push_back({role, content.get()}); - owned_content.push_back(std::move(content)); +// Add a message to `messages` and store its content in `msg_strs` +static void add_message(const char * role, const std::string & text, LlamaData & llama_data) { + llama_data.msg_strs.push_back(std::move(text)); + llama_data.messages.push_back({ role, llama_data.msg_strs.back().c_str() }); } // Function to apply the chat template and resize `formatted` if needed -static int apply_chat_template(const LlamaData & llama_data, std::vector & formatted, const bool append) { - int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(), - llama_data.messages.size(), append, formatted.data(), formatted.size()); - if (result > static_cast(formatted.size())) { - formatted.resize(result); +static int apply_chat_template(LlamaData & llama_data, const bool append) { + int result = llama_chat_apply_template( + llama_data.model.get(), nullptr, llama_data.messages.data(), llama_data.messages.size(), append, + append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0); + if (append && result > static_cast(llama_data.fmtted.size())) { + llama_data.fmtted.resize(result); result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(), - llama_data.messages.size(), append, formatted.data(), formatted.size()); + llama_data.messages.size(), append, llama_data.fmtted.data(), + llama_data.fmtted.size()); } return result; @@ -199,7 +721,8 @@ static int tokenize_prompt(const llama_model_ptr & model, const std::string & pr prompt_tokens.resize(n_prompt_tokens); if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) { - GGML_ABORT("failed to tokenize the prompt\n"); + printe("failed to tokenize the prompt\n"); + return -1; } return n_prompt_tokens; @@ -207,11 +730,11 @@ static int tokenize_prompt(const llama_model_ptr & model, const std::string & pr // Check if we have enough space in the context to evaluate this batch static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { - const int n_ctx = llama_n_ctx(ctx.get()); + const int n_ctx = llama_n_ctx(ctx.get()); const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get()); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); - fprintf(stderr, "context size exceeded\n"); + printe("context size exceeded\n"); return 1; } @@ -221,9 +744,10 @@ static int check_context_size(const llama_context_ptr & ctx, const llama_batch & // convert the token to a string static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) { char buf[256]; - int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true); + int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true); if (n < 0) { - GGML_ABORT("failed to convert token to piece\n"); + printe("failed to convert token to piece\n"); + return 1; } piece = std::string(buf, n); @@ -238,19 +762,19 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st // helper function to evaluate a prompt and generate a response static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) { - std::vector prompt_tokens; - const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens); - if (n_prompt_tokens < 0) { + std::vector tokens; + if (tokenize_prompt(llama_data.model, prompt, tokens) < 0) { return 1; } // prepare a batch for the prompt - llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); + llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size()); llama_token new_token_id; while (true) { check_context_size(llama_data.context, batch); if (llama_decode(llama_data.context.get(), batch)) { - GGML_ABORT("failed to decode\n"); + printe("failed to decode\n"); + return 1; } // sample the next token, check is it an end of generation? @@ -273,44 +797,34 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str return 0; } -static int parse_arguments(const int argc, const char ** argv, Options & opt) { - ArgumentParser parser(argv[0]); - parser.add_argument("-m", opt.model_path, "model"); - parser.add_argument("-p", opt.prompt_non_interactive, "prompt"); - parser.add_argument("-c", opt.n_ctx, "context_size"); - parser.add_argument("-ngl", opt.ngl, "n_gpu_layers"); - if (parser.parse(argc, argv)) { - return 1; - } - - return 0; -} - static int read_user_input(std::string & user) { std::getline(std::cin, user); - return user.empty(); // Indicate an error or empty input + return user.empty(); // Should have data in happy path } // Function to generate a response based on the prompt -static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) { +static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response, + const bool stdout_a_terminal) { // Set response color - printf("\033[33m"); + if (stdout_a_terminal) { + printf("\033[33m"); + } + if (generate(llama_data, prompt, response)) { - fprintf(stderr, "failed to generate response\n"); + printe("failed to generate response\n"); return 1; } // End response with color reset and newline - printf("\n\033[0m"); + printf("\n%s", stdout_a_terminal ? "\033[0m" : ""); return 0; } // Helper function to apply the chat template and handle errors -static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector & formatted, - const bool is_user_input, int & output_length) { - const int new_len = apply_chat_template(llama_data, formatted, is_user_input); +static int apply_chat_template_with_error_handling(LlamaData & llama_data, const bool append, int & output_length) { + const int new_len = apply_chat_template(llama_data, append); if (new_len < 0) { - fprintf(stderr, "failed to apply the chat template\n"); + printe("failed to apply the chat template\n"); return -1; } @@ -319,62 +833,82 @@ static int apply_chat_template_with_error_handling(const LlamaData & llama_data, } // Helper function to handle user input -static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) { - if (!prompt_non_interactive.empty()) { - user_input = prompt_non_interactive; - return true; // No need for interactive input +static int handle_user_input(std::string & user_input, const std::string & user) { + if (!user.empty()) { + user_input = user; + return 0; // No need for interactive input } - printf("\033[32m> \033[0m"); - return !read_user_input(user_input); // Returns false if input ends the loop + printf( + "\r%*s" + "\r\033[32m> \033[0m", + get_terminal_width(), " "); + return read_user_input(user_input); // Returns true if input ends the loop +} + +static bool is_stdin_a_terminal() { +#if defined(_WIN32) + HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE); + DWORD mode; + return GetConsoleMode(hStdin, &mode); +#else + return isatty(STDIN_FILENO); +#endif +} + +static bool is_stdout_a_terminal() { +#if defined(_WIN32) + HANDLE hStdout = GetStdHandle(STD_OUTPUT_HANDLE); + DWORD mode; + return GetConsoleMode(hStdout, &mode); +#else + return isatty(STDOUT_FILENO); +#endif } // Function to tokenize the prompt -static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) { - std::vector owned_content; - std::vector fmtted(llama_n_ctx(llama_data.context.get())); +static int chat_loop(LlamaData & llama_data, const std::string & user) { int prev_len = 0; - + llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get())); + static const bool stdout_a_terminal = is_stdout_a_terminal(); while (true) { // Get user input std::string user_input; - if (!handle_user_input(user_input, prompt_non_interactive)) { - break; + while (handle_user_input(user_input, user)) { } - add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data, - owned_content); - + add_message("user", user.empty() ? user_input : user, llama_data); int new_len; - if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) { + if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) { return 1; } - std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len); + std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len); std::string response; - if (generate_response(llama_data, prompt, response)) { + if (generate_response(llama_data, prompt, response, stdout_a_terminal)) { + return 1; + } + + if (!user.empty()) { + break; + } + + add_message("assistant", response, llama_data); + if (apply_chat_template_with_error_handling(llama_data, false, prev_len) < 0) { return 1; } } + return 0; } -static void log_callback(const enum ggml_log_level level, const char * text, void *) { - if (level == GGML_LOG_LEVEL_ERROR) { - fprintf(stderr, "%s", text); +static void log_callback(const enum ggml_log_level level, const char * text, void * p) { + const Opt * opt = static_cast(p); + if (opt->verbose || level == GGML_LOG_LEVEL_ERROR) { + printe("%s", text); } } -static bool is_stdin_a_terminal() { -#if defined(_WIN32) - HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE); - DWORD mode; - return GetConsoleMode(hStdin, &mode); -#else - return isatty(STDIN_FILENO); -#endif -} - static std::string read_pipe_data() { std::ostringstream result; result << std::cin.rdbuf(); // Read all data from std::cin @@ -382,26 +916,29 @@ static std::string read_pipe_data() { } int main(int argc, const char ** argv) { - Options opt; - if (parse_arguments(argc, argv, opt)) { + Opt opt; + const int ret = opt.init(argc, argv); + if (ret == 2) { + return 0; + } else if (ret) { return 1; } if (!is_stdin_a_terminal()) { - if (!opt.prompt_non_interactive.empty()) { - opt.prompt_non_interactive += "\n\n"; + if (!opt.user.empty()) { + opt.user += "\n\n"; } - opt.prompt_non_interactive += read_pipe_data(); + opt.user += read_pipe_data(); } - llama_log_set(log_callback, nullptr); + llama_log_set(log_callback, &opt); LlamaData llama_data; if (llama_data.init(opt)) { return 1; } - if (chat_loop(llama_data, opt.prompt_non_interactive)) { + if (chat_loop(llama_data, opt.user)) { return 1; } diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 63fca1d590f..1b7cc8c1328 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -15,7 +15,7 @@ set(TARGET_SRCS httplib.h ) set(PUBLIC_ASSETS - index.html + index.html.gz loading.html ) @@ -34,6 +34,7 @@ endforeach() add_executable(${TARGET} ${TARGET_SRCS}) install(TARGETS ${TARGET} RUNTIME) +target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}) target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) if (LLAMA_SERVER_SSL) diff --git a/examples/server/README.md b/examples/server/README.md index 6294f541fc7..c7d91be9976 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -62,8 +62,8 @@ The project is under active development, and we are [looking for feedback and co | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | | `-dkvc, --dump-kv-cache` | verbose print of the KV cache | | `-nkvo, --no-kv-offload` | disable KV offload
(env: LLAMA_ARG_NO_KV_OFFLOAD) | -| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | -| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | +| `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | +| `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | @@ -104,7 +104,6 @@ The project is under active development, and we are [looking for feedback and co | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: dkypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--penalize-nl` | penalize newline tokens (default: false) | | `--temp N` | temperature (default: 0.8) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled) | | `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | @@ -138,6 +137,7 @@ The project is under active development, and we are [looking for feedback and co | -------- | ----------- | | `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)
(env: LLAMA_ARG_NO_CONTEXT_SHIFT) | | `-sp, --special` | special tokens output enabled (default: false) | +| `--no-warmup` | skip warming up the model with an empty run | | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | | `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified
(env: LLAMA_ARG_POOLING) | | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | @@ -146,7 +146,7 @@ The project is under active development, and we are [looking for feedback and co | `--host HOST` | ip address to listen (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) | | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | -| `--no-webui` | disable the Web UI
(env: LLAMA_ARG_NO_WEBUI) | +| `--no-webui` | Disable the Web UI (default: enabled)
(env: LLAMA_ARG_NO_WEBUI) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | | `--api-key KEY` | API key to use for authentication (default: none)
(env: LLAMA_API_KEY) | @@ -164,13 +164,13 @@ The project is under active development, and we are [looking for feedback and co | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
list of built-in templates:
chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
| | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | -| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16) | -| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5) | -| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9) | -| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model) | +| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX) | +| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)
(env: LLAMA_ARG_DRAFT_MIN) | +| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)
(env: LLAMA_ARG_DRAFT_P_MIN) | +| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE_DRAFT) | | `-devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | -| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model | -| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) | +| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | +| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_MODEL_DRAFT) | Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. @@ -303,23 +303,23 @@ mkdir llama-client cd llama-client ``` -Create a index.js file and put this inside: +Create an index.js file and put this inside: ```javascript -const prompt = `Building a website can be done in 10 simple steps:`; +const prompt = "Building a website can be done in 10 simple steps:" -async function Test() { +async function test() { let response = await fetch("http://127.0.0.1:8080/completion", { - method: 'POST', + method: "POST", body: JSON.stringify({ prompt, - n_predict: 512, + n_predict: 64, }) }) console.log((await response.json()).content) } -Test() +test() ``` And run it: @@ -343,6 +343,10 @@ node index.js ### POST `/completion`: Given a `prompt`, it returns the predicted completion. +> [!IMPORTANT] +> +> This endpoint is **not** OAI-compatible + *Options:* `prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, if `cache_prompt` is `true`, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. A `BOS` token is inserted at the start, if all of the following conditions are true: @@ -381,7 +385,7 @@ Multiple prompts are also supported. In this case, the completion result will be `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token. By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt. -`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. +`stream`: Allows receiving each predicted token in real-time instead of waiting for the completion to finish (uses a different response format). To enable this, set to `true`. `stop`: Specify a JSON array of stopping strings. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]` @@ -392,8 +396,6 @@ These words will not be included in the completion, so make sure to add them to `repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size. -`penalize_nl`: Penalize newline tokens when applying the repeat penalty. Default: `true` - `presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled. `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled. @@ -440,40 +442,76 @@ These words will not be included in the completion, so make sure to add them to `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true` +`return_tokens`: Return the raw generated token ids in the `tokens` field. Otherwise `tokens` remains empty. Default: `false` + `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values. - `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` +`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` -**Response format** +`post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain. -- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. +`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error. -- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure: +**Response format** -```json -{ - "content": "", - "probs": [ - { - "prob": float, - "tok_str": "" - }, - { - "prob": float, - "tok_str": "" - }, - ... - ] -}, -``` +- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support. -Notice that each `probs` is an array of length `n_probs`. +- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has a nested array `top_logprobs`. It contains at **maximum** `n_probs` elements: + ```json + { + "content": "", + "tokens": [ generated token ids if requested ], + ... + "probs": [ + { + "id": , + "logprob": float, + "token": "", + "bytes": [int, int, ...], + "top_logprobs": [ + { + "id": , + "logprob": float, + "token": "", + "bytes": [int, int, ...], + }, + { + "id": , + "logprob": float, + "token": "", + "bytes": [int, int, ...], + }, + ... + ] + }, + { + "id": , + "logprob": float, + "token": "", + "bytes": [int, int, ...], + "top_logprobs": [ + ... + ] + }, + ... + ] + }, + ``` + Please note that if `post_sampling_probs` is set to `true`: + - `logprob` will be replaced with `prob`, with the value between 0.0 and 1.0 + - `top_logprobs` will be replaced with `top_probs`. Each element contains: + - `id`: token ID + - `token`: token in string + - `bytes`: token in bytes + - `prob`: token probability, with the value between 0.0 and 1.0 + - Number of elements in `top_probs` may be less than `n_probs` - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string. +- `tokens`: Same as `content` but represented as raw token ids. Only populated if `"return_tokens": true` or `"stream": true` in the request. - `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options) - `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.). -- `model`: The path to the model loaded with `-m` -- `prompt`: The provided `prompt` +- `model`: The model alias (for model path, please use `/props` endpoint) +- `prompt`: The processed `prompt` (special tokens may be added) - `stop_type`: Indicating whether the completion has stopped. Possible values are: - `none`: Generating (not stopped) - `eos`: Stopped because it encountered the EOS token @@ -654,7 +692,6 @@ This endpoint is public (no API key check). By default, it is read-only. To make "mirostat": 0, "mirostat_tau": 5.0, "mirostat_eta": 0.10000000149011612, - "penalize_nl": false, "stop": [], "max_tokens": -1, "n_keep": 0, @@ -689,7 +726,8 @@ This endpoint is public (no API key check). By default, it is read-only. To make }, "total_slots": 1, "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf", - "chat_template": "..." + "chat_template": "...", + "build_info": "b(build number)-(build commit hash)" } ``` @@ -762,6 +800,8 @@ curl http://localhost:8080/v1/chat/completions \ ### POST `/v1/embeddings`: OpenAI-compatible embeddings API +This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm. + *Options:* See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings). @@ -794,6 +834,46 @@ See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-r }' ``` +### POST `/embeddings`: non-OpenAI-compatible embeddings API + +This endpoint supports all poolings, including `--pooling none`. When the pooling is `none`, the responses will contain the *unnormalized* embeddings for *all* input tokens. For all other pooling types, only the pooled embeddings are returned, normalized using Euclidian norm. + +Note that the response format of this endpoint is different from `/v1/embeddings`. + +*Options:* + +Same as the `/v1/embeddings` endpoint. + +*Examples:* + +Same as the `/v1/embeddings` endpoint. + +**Response format** + +```json +[ + { + "index": 0, + "embedding": [ + [ ... embeddings for token 0 ... ], + [ ... embeddings for token 1 ... ], + [ ... ] + [ ... embeddings for token N-1 ... ], + ] + }, + ... + { + "index": P, + "embedding": [ + [ ... embeddings for token 0 ... ], + [ ... embeddings for token 1 ... ], + [ ... ] + [ ... embeddings for token N-1 ... ], + ] + } +] +``` + ### GET `/slots`: Returns the current slots processing state > [!WARNING] @@ -844,7 +924,6 @@ Example: "mirostat": 0, "mirostat_tau": 5.0, "mirostat_eta": 0.10000000149011612, - "penalize_nl": false, "stop": [], "max_tokens": -1, "n_keep": 0, diff --git a/examples/server/public/index.html b/examples/server/public/index.html deleted file mode 100644 index 250729a4412..00000000000 --- a/examples/server/public/index.html +++ /dev/null @@ -1,351 +0,0 @@ - - - - - - - - 🦙 llama.cpp - chat - - - - - -
-
- - - -
- -
-
-

Conversations

- - - -
- - -
- + New conversation -
-
- {{ conv.messages[0].content }} -
-
- Conversations are saved to browser's localStorage -
-
-
- - -
- -
- - - -
llama.cpp
- - -
- - - - - -
-
- - -
-
- - {{ messages.length === 0 ? 'Send a message to start' : '' }} -
-
-
-
- - - - -
-
- - -
- - - - - -
-
- - -
-
- - -
-
-
- - -
- - - -
-
- -
- - - - - - - -
- - - - - - - diff --git a/examples/server/public/index.html.gz b/examples/server/public/index.html.gz new file mode 100644 index 00000000000..36f9c9fe9a6 Binary files /dev/null and b/examples/server/public/index.html.gz differ diff --git a/examples/server/public_legacy/index-new.html b/examples/server/public_legacy/index-new.html index 8bfa380e573..cbfbbdf2806 100644 --- a/examples/server/public_legacy/index-new.html +++ b/examples/server/public_legacy/index-new.html @@ -39,7 +39,6 @@ temperature: 0.8, // adapt all following parameters to optimized min-p requierements. If for non-english, set to 0.6 or lower repeat_last_n: 0, // 0 = disable penalty, -1 = context size repeat_penalty: 1.0, // 1.0 = disabled - penalize_nl: false, // true only useful for infinite completion dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well dry_base: 1.75, // 0.0 = disabled dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well diff --git a/examples/server/public_legacy/index.html b/examples/server/public_legacy/index.html index a95f5c6df87..75f39330a78 100644 --- a/examples/server/public_legacy/index.html +++ b/examples/server/public_legacy/index.html @@ -303,7 +303,6 @@ temperature: 0.7, repeat_last_n: 256, // 0 = disable penalty, -1 = context size repeat_penalty: 1.18, // 1.0 = disabled - penalize_nl: false, dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well dry_base: 1.75, // 0.0 = disabled dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well @@ -1006,7 +1005,6 @@ ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })} ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })} ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })} - ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })} ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })} ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })} ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })} diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8cb992470a3..30ff3b14957 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -15,7 +15,7 @@ #define MIMETYPE_JSON "application/json; charset=utf-8" // auto generated files (update with ./deps.sh) -#include "index.html.hpp" +#include "index.html.gz.hpp" #include "loading.html.hpp" #include @@ -79,8 +79,9 @@ enum error_type { }; struct slot_params { - bool stream = true; - bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt + bool stream = true; + bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt + bool return_tokens = false; int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half @@ -91,7 +92,9 @@ struct slot_params { int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit std::vector antiprompt; + std::vector response_fields; bool timings_per_token = false; + bool post_sampling_probs = false; bool ignore_eos = false; struct common_params_sampling sampling; @@ -135,7 +138,6 @@ struct slot_params { {"mirostat", sampling.mirostat}, {"mirostat_tau", sampling.mirostat_tau}, {"mirostat_eta", sampling.mirostat_eta}, - {"penalize_nl", sampling.penalize_nl}, {"stop", antiprompt}, {"max_tokens", n_predict}, // User configured n_predict {"n_keep", n_keep}, @@ -151,6 +153,7 @@ struct slot_params { {"speculative.n_min", speculative.n_min}, {"speculative.p_min", speculative.p_min}, {"timings_per_token", timings_per_token}, + {"post_sampling_probs", post_sampling_probs}, }; } }; @@ -184,6 +187,7 @@ struct server_task { static slot_params params_from_json_cmpl( const llama_model * model, + const llama_context * ctx, const common_params & params_base, const json & data) { slot_params params; @@ -199,12 +203,14 @@ struct server_task { params.stream = json_value(data, "stream", false); params.cache_prompt = json_value(data, "cache_prompt", true); + params.return_tokens = json_value(data, "return_tokens", false); params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict)); params.n_indent = json_value(data, "n_indent", defaults.n_indent); params.n_keep = json_value(data, "n_keep", defaults.n_keep); params.n_discard = json_value(data, "n_discard", defaults.n_discard); //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); + params.response_fields = json_value(data, "response_fields", std::vector()); params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); @@ -226,10 +232,10 @@ struct server_task { params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); - params.sampling.penalize_nl = json_value(data, "penalize_nl", defaults.sampling.penalize_nl); params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); + params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs); params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); @@ -239,8 +245,27 @@ struct server_task { params.speculative.n_min = std::max(params.speculative.n_min, 2); params.speculative.n_max = std::max(params.speculative.n_max, 0); + // TODO: add more sanity checks for the input parameters + + if (params.sampling.penalty_last_n < -1) { + throw std::runtime_error("Error: repeat_last_n must be >= -1"); + } + + if (params.sampling.dry_penalty_last_n < -1) { + throw std::runtime_error("Error: dry_penalty_last_n must be >= -1"); + } + + if (params.sampling.penalty_last_n == -1) { + // note: should be the slot's context and not the full context, but it's ok + params.sampling.penalty_last_n = llama_n_ctx(ctx); + } + + if (params.sampling.dry_penalty_last_n == -1) { + params.sampling.dry_penalty_last_n = llama_n_ctx(ctx); + } + if (params.sampling.dry_base < 1.0f) { - params.sampling.dry_base = defaults.sampling.dry_base; + params.sampling.dry_base = defaults.sampling.dry_base; } // sequence breakers for DRY @@ -416,41 +441,75 @@ inline std::string stop_type_to_str(stop_type type) { struct completion_token_output { llama_token tok; + float prob; std::string text_to_send; - struct token_prob { + struct prob_info { llama_token tok; - std::string tok_str; + std::string txt; float prob; }; - std::vector probs; + std::vector probs; - json to_json() const { + json to_json(bool post_sampling_probs) const { json probs_for_token = json::array(); for (const auto & p : probs) { + std::string txt(p.txt); + txt.resize(validate_utf8(txt)); probs_for_token.push_back(json { - {"tok_str", p.tok_str}, - {"prob", p.prob}, + {"id", p.tok}, + {"token", txt}, + {"bytes", str_to_bytes(p.txt)}, + { + post_sampling_probs ? "prob" : "logprob", + post_sampling_probs ? p.prob : logarithm(p.prob) + }, }); } return probs_for_token; } - static json probs_vector_to_json(const std::vector & probs) { + static json probs_vector_to_json(const std::vector & probs, bool post_sampling_probs) { json out = json::array(); - for (const auto & prob : probs) { - const std::string tok_str = prob.text_to_send; + for (const auto & p : probs) { + std::string txt(p.text_to_send); + txt.resize(validate_utf8(txt)); out.push_back(json { - {"content", tok_str}, - {"probs", prob.to_json()}, + {"id", p.tok}, + {"token", txt}, + {"bytes", str_to_bytes(p.text_to_send)}, + { + post_sampling_probs ? "prob" : "logprob", + post_sampling_probs ? p.prob : logarithm(p.prob) + }, + { + post_sampling_probs ? "top_probs" : "top_logprobs", + p.to_json(post_sampling_probs) + }, }); } return out; } + + static float logarithm(float x) { + // nlohmann::json converts -inf to null, so we need to prevent that + return x == 0.0f ? std::numeric_limits::lowest() : std::log(x); + } + + static std::vector str_to_bytes(const std::string & str) { + std::vector bytes; + for (unsigned char c : str) { + bytes.push_back(c); + } + return bytes; + } }; struct server_task_result_cmpl_final : server_task_result { int index = 0; - std::string content; + + std::string content; + llama_tokens tokens; + bool stream; result_timings timings; std::string prompt; @@ -459,11 +518,13 @@ struct server_task_result_cmpl_final : server_task_result { int32_t n_decoded; int32_t n_prompt_tokens; int32_t n_tokens_cached; - int32_t has_new_line; + bool has_new_line; std::string stopping_word; stop_type stop = STOP_TYPE_NONE; + bool post_sampling_probs; std::vector probs_output; + std::vector response_fields; slot_params generation_params; @@ -492,6 +553,7 @@ struct server_task_result_cmpl_final : server_task_result { json res = json { {"index", index}, {"content", stream ? "" : content}, // in stream mode, content is already in last partial chunk + {"tokens", stream ? llama_tokens {} : tokens}, {"id_slot", id_slot}, {"stop", true}, {"model", oaicompat_model}, @@ -506,10 +568,10 @@ struct server_task_result_cmpl_final : server_task_result { {"tokens_cached", n_tokens_cached}, {"timings", timings.to_json()}, }; - if (!probs_output.empty()) { - res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output); + if (!stream && !probs_output.empty()) { + res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs); } - return res; + return response_fields.empty() ? res : json_get_nested_values(response_fields, res); } json to_json_oaicompat_chat() { @@ -518,22 +580,29 @@ struct server_task_result_cmpl_final : server_task_result { finish_reason = "stop"; } - json choices = json::array({json{ + json choice = json{ {"finish_reason", finish_reason}, {"index", 0}, - {"message", json{ + {"message", json { {"content", content}, - {"role", "assistant"} + {"role", "assistant"} } - }}}); + }}; + + if (!stream && probs_output.size() > 0) { + choice["logprobs"] = json{ + {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)}, + }; + } std::time_t t = std::time(0); json res = json { - {"choices", choices}, - {"created", t}, - {"model", oaicompat_model}, - {"object", "chat.completion"}, + {"choices", json::array({choice})}, + {"created", t}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "chat.completion"}, {"usage", json { {"completion_tokens", n_decoded}, {"prompt_tokens", n_prompt_tokens}, @@ -560,16 +629,19 @@ struct server_task_result_cmpl_final : server_task_result { finish_reason = "stop"; } - json choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); + json choice = json{ + {"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()} + }; json ret = json { - {"choices", choices}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"}, + {"choices", json::array({choice})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "chat.completion.chunk"}, {"usage", json { {"completion_tokens", n_decoded}, {"prompt_tokens", n_prompt_tokens}, @@ -587,12 +659,15 @@ struct server_task_result_cmpl_final : server_task_result { struct server_task_result_cmpl_partial : server_task_result { int index = 0; - std::string content; + + std::string content; + llama_tokens tokens; int32_t n_decoded; int32_t n_prompt_tokens; - std::vector probs_output; + bool post_sampling_probs; + completion_token_output prob_output; result_timings timings; // OAI-compat fields @@ -619,6 +694,7 @@ struct server_task_result_cmpl_partial : server_task_result { json res = json { {"index", index}, {"content", content}, + {"tokens", tokens}, {"stop", false}, {"id_slot", id_slot}, {"tokens_predicted", n_decoded}, @@ -628,8 +704,8 @@ struct server_task_result_cmpl_partial : server_task_result { if (timings.prompt_n > 0) { res.push_back({"timings", timings.to_json()}); } - if (!probs_output.empty()) { - res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output); + if (!prob_output.probs.empty()) { + res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs); } return res; } @@ -660,7 +736,7 @@ struct server_task_result_cmpl_partial : server_task_result { json second_ret = json{ {"choices", json::array({json{{"finish_reason", nullptr}, {"index", 0}, - {"delta", json{ + {"delta", json { {"content", content}}} }})}, {"created", t}, @@ -675,18 +751,27 @@ struct server_task_result_cmpl_partial : server_task_result { {"finish_reason", nullptr}, {"index", 0}, {"delta", - json{ + json { {"content", content}, }}, }}); } + GGML_ASSERT(choices.size() >= 1); + + if (prob_output.probs.size() > 0) { + choices[0]["logprobs"] = json{ + {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)}, + }; + } + json ret = json { - {"choices", choices}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"} + {"choices", choices}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "chat.completion.chunk"} }; if (timings.prompt_n >= 0) { @@ -699,32 +784,52 @@ struct server_task_result_cmpl_partial : server_task_result { struct server_task_result_embd : server_task_result { int index = 0; - std::vector embedding; + std::vector> embedding; + + int32_t n_tokens; + + // OAI-compat fields + bool oaicompat = false; virtual int get_index() override { return index; } virtual json to_json() override { + return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat(); + } + + json to_json_non_oaicompat() { return json { {"index", index}, {"embedding", embedding}, }; } + + json to_json_oaicompat() { + return json { + {"index", index}, + {"embedding", embedding[0]}, + {"tokens_evaluated", n_tokens}, + }; + } }; struct server_task_result_rerank : server_task_result { int index = 0; float score = -1e6; + int32_t n_tokens; + virtual int get_index() override { return index; } virtual json to_json() override { return json { - {"index", index}, - {"score", score}, + {"index", index}, + {"score", score}, + {"tokens_evaluated", n_tokens}, }; } }; @@ -931,8 +1036,11 @@ struct server_slot { size_t last_nl_pos = 0; - std::string generated_text; + std::string generated_text; + llama_tokens generated_tokens; + llama_tokens cache_tokens; + std::vector generated_token_probs; bool has_next_token = true; @@ -951,7 +1059,6 @@ struct server_slot { // stats size_t n_sent_text = 0; // number of sent text character - size_t n_sent_token_probs = 0; int64_t t_start_process_prompt; int64_t t_start_generation; @@ -973,9 +1080,9 @@ struct server_slot { stopping_word = ""; n_past = 0; n_sent_text = 0; - n_sent_token_probs = 0; task_type = SERVER_TASK_TYPE_COMPLETION; + generated_tokens.clear(); generated_token_probs.clear(); } @@ -1079,9 +1186,9 @@ struct server_slot { SLT_INF(*this, "\n" - "\rprompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" - "\r eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" - "\r total time = %10.2f ms / %5d tokens\n", + "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" + " eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" + " total time = %10.2f ms / %5d tokens\n", t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second, t_token_generation, n_decoded, t_gen, n_gen_second, t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded); @@ -1469,7 +1576,7 @@ struct server_context { n_ctx = llama_n_ctx(ctx); add_bos_token = llama_add_bos_token(model); - has_eos_token = !llama_add_eos_token(model); + has_eos_token = llama_token_eos(model) != LLAMA_TOKEN_NULL; if (!params_base.speculative.model.empty()) { SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str()); @@ -1713,35 +1820,19 @@ struct server_context { bool process_token(completion_token_output & result, server_slot & slot) { // remember which tokens were sampled - used for repetition penalties during sampling - const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special); + const std::string token_str = result.text_to_send; slot.sampled = result.tok; - // search stop word and delete it slot.generated_text += token_str; + if (slot.params.return_tokens) { + slot.generated_tokens.push_back(result.tok); + } slot.has_next_token = true; // check if there is incomplete UTF-8 character at the end - bool incomplete = false; - for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) { - unsigned char c = slot.generated_text[slot.generated_text.size() - i]; - if ((c & 0xC0) == 0x80) { - // continuation byte: 10xxxxxx - continue; - } - if ((c & 0xE0) == 0xC0) { - // 2-byte character: 110xxxxx ... - incomplete = i < 2; - } else if ((c & 0xF0) == 0xE0) { - // 3-byte character: 1110xxxx ... - incomplete = i < 3; - } else if ((c & 0xF8) == 0xF0) { - // 4-byte character: 11110xxx ... - incomplete = i < 4; - } - // else 1-byte character or invalid byte - break; - } + bool incomplete = validate_utf8(slot.generated_text) < slot.generated_text.size(); + // search stop word and delete it if (!incomplete) { size_t pos = std::min(slot.n_sent_text, slot.generated_text.size()); @@ -1869,6 +1960,55 @@ struct server_context { return slot.has_next_token; // continue } + void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) { + size_t n_probs = slot.params.sampling.n_probs; + size_t n_vocab = llama_n_vocab(llama_get_model(ctx)); + if (post_sampling) { + const auto * cur_p = common_sampler_get_candidates(slot.smpl); + const size_t max_probs = cur_p->size; + + // set probability for sampled token + for (size_t i = 0; i < max_probs; i++) { + if (cur_p->data[i].id == result.tok) { + result.prob = cur_p->data[i].p; + break; + } + } + + // set probability for top n_probs tokens + result.probs.reserve(max_probs); + for (size_t i = 0; i < std::min(max_probs, n_probs); i++) { + result.probs.push_back({ + cur_p->data[i].id, + common_detokenize(ctx, {cur_p->data[i].id}, special), + cur_p->data[i].p + }); + } + } else { + // TODO: optimize this with min-p optimization + std::vector cur = get_token_probabilities(ctx, idx); + + // set probability for sampled token + for (size_t i = 0; i < n_vocab; i++) { + // set probability for sampled token + if (cur[i].id == result.tok) { + result.prob = cur[i].p; + break; + } + } + + // set probability for top n_probs tokens + result.probs.reserve(n_probs); + for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) { + result.probs.push_back({ + cur[i].id, + common_detokenize(ctx, {cur[i].id}, special), + cur[i].p + }); + } + } + } + void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { send_error(task.id, error, type); } @@ -1894,9 +2034,11 @@ struct server_context { res->id = slot.id_task; res->index = slot.index; res->content = tkn.text_to_send; + res->tokens = { tkn.tok }; - res->n_decoded = slot.n_decoded; - res->n_prompt_tokens = slot.n_prompt_tokens; + res->n_decoded = slot.n_decoded; + res->n_prompt_tokens = slot.n_prompt_tokens; + res->post_sampling_probs = slot.params.post_sampling_probs; res->verbose = slot.params.verbose; res->oaicompat = slot.params.oaicompat; @@ -1906,17 +2048,7 @@ struct server_context { // populate res.probs_output if (slot.params.sampling.n_probs > 0) { - const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); - - const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); - const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size()); - - std::vector probs_output; - if (probs_pos < probs_stop_pos) { - res->probs_output = std::vector( - slot.generated_token_probs.begin() + probs_pos, - slot.generated_token_probs.begin() + probs_stop_pos); - } + res->prob_output = tkn; // copy the token probs } // populate timings if this is final response or timings_per_token is enabled @@ -1934,16 +2066,19 @@ struct server_context { res->index = slot.index; res->content = slot.generated_text; + res->tokens = slot.generated_tokens; res->timings = slot.get_timings(); res->prompt = common_detokenize(ctx, slot.prompt_tokens, true); + res->response_fields = slot.params.response_fields; - res->truncated = slot.truncated; - res->n_decoded = slot.n_decoded; - res->n_prompt_tokens = slot.n_prompt_tokens; - res->n_tokens_cached = slot.n_past; - res->has_new_line = slot.has_new_line; - res->stopping_word = slot.stopping_word; - res->stop = slot.stop; + res->truncated = slot.truncated; + res->n_decoded = slot.n_decoded; + res->n_prompt_tokens = slot.n_prompt_tokens; + res->n_tokens_cached = slot.n_past; + res->has_new_line = slot.has_new_line; + res->stopping_word = slot.stopping_word; + res->stop = slot.stop; + res->post_sampling_probs = slot.params.post_sampling_probs; res->verbose = slot.params.verbose; res->stream = slot.params.stream; @@ -1975,8 +2110,10 @@ struct server_context { void send_embedding(const server_slot & slot, const llama_batch & batch) { auto res = std::make_unique(); - res->id = slot.id_task; - res->index = slot.index; + res->id = slot.id_task; + res->index = slot.index; + res->n_tokens = slot.n_prompt_tokens; + res->oaicompat = slot.params.oaicompat; const int n_embd = llama_n_embd(model); @@ -1995,12 +2132,18 @@ struct server_context { if (embd == NULL) { SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - res->embedding = std::vector(n_embd, 0.0f); + res->embedding.push_back(std::vector(n_embd, 0.0f)); continue; } - common_embd_normalize(embd, embd_res.data(), n_embd); - res->embedding = embd_res; + // normalize only when there is pooling + // TODO: configurable + if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) { + common_embd_normalize(embd, embd_res.data(), n_embd, 2); + res->embedding.push_back(embd_res); + } else { + res->embedding.push_back({ embd, embd + n_embd }); + } } SLT_DBG(slot, "%s", "sending embeddings\n"); @@ -2012,6 +2155,7 @@ struct server_context { auto res = std::make_unique(); res->id = slot.id_task; res->index = slot.index; + res->n_tokens = slot.n_prompt_tokens; for (int i = 0; i < batch.n_tokens; ++i) { if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { @@ -2613,7 +2757,10 @@ struct server_context { // add prompt tokens for processing in the current batch while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { - common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, false); + // without pooling, we want to output the embeddings for all the tokens in the batch + const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE; + + common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd); if (slot.params.cache_prompt) { slot.cache_tokens.push_back(prompt_tokens[slot.n_past]); @@ -2728,7 +2875,9 @@ struct server_context { continue; // continue loop of slots } - llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i); + const int tok_idx = slot.i_batch - i; + + llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx); slot.i_batch = -1; @@ -2747,17 +2896,12 @@ struct server_context { slot.t_token_generation = (t_current - slot.t_start_generation) / 1e3; completion_token_output result; - result.tok = id; - - const auto * cur_p = common_sampler_get_candidates(slot.smpl); + result.tok = id; + result.text_to_send = common_token_to_piece(ctx, result.tok, params_base.special); + result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs - for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) { - auto tok_id = cur_p->data[i].id; - result.probs.push_back({ - tok_id, - tokens_to_output_formatted_string(ctx, tok_id), - i >= cur_p->size ? 0.0f : cur_p->data[i].p, - }); + if (slot.params.sampling.n_probs > 0) { + populate_token_probs(slot, result, slot.params.post_sampling_probs, params_base.special, tok_idx); } if (!process_token(result, slot)) { @@ -2841,7 +2985,11 @@ struct server_context { for (size_t i = 0; i < ids.size(); ++i) { completion_token_output result; - result.tok = ids[i]; + result.tok = ids[i]; + result.text_to_send = common_token_to_piece(ctx, result.tok, params_base.special); + result.prob = 1.0f; // set later + + // TODO: set result.probs if (!process_token(result, slot)) { // release slot because of stop condition @@ -3335,6 +3483,7 @@ int main(int argc, char ** argv) { { "total_slots", ctx_server.params_base.n_parallel }, { "model_path", ctx_server.params_base.model }, { "chat_template", llama_get_chat_template(ctx_server.model) }, + { "build_info", build_info }, }; res_ok(res, data); @@ -3381,7 +3530,7 @@ int main(int argc, char ** argv) { task.index = i; task.prompt_tokens = std::move(tokenized_prompts[i]); - task.params = server_task::params_from_json_cmpl(ctx_server.model, ctx_server.params_base, data); + task.params = server_task::params_from_json_cmpl(ctx_server.model, ctx_server.ctx, ctx_server.params_base, data); task.id_selected_slot = json_value(data, "id_slot", -1); // OAI-compat @@ -3556,7 +3705,7 @@ int main(int argc, char ** argv) { {"object", "list"}, {"data", { { - {"id", params.model_alias}, + {"id", params.model_alias.empty() ? params.model : params.model_alias}, {"object", "model"}, {"created", std::time(0)}, {"owned_by", "llamacpp"}, @@ -3621,34 +3770,61 @@ int main(int argc, char ** argv) { res_ok(res, data); }; - const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { + const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, bool oaicompat) { const json body = json::parse(req.body); - bool oaicompat = false; - // an input prompt can be a string or a list of tokens (integer) + if (oaicompat && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { + res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST)); + return; + } + + // for the shape of input/content, see tokenize_input_prompts() json prompt; if (body.count("input") != 0) { - oaicompat = true; prompt = body.at("input"); - } else if (body.count("content") != 0) { - // with "content", we only support single prompt - prompt = std::vector{body.at("content")}; + } else if (body.contains("content")) { + oaicompat = false; + prompt = body.at("content"); } else { res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); return; } + bool use_base64 = false; + if (body.count("encoding_format") != 0) { + const std::string& format = body.at("encoding_format"); + if (format == "base64") { + use_base64 = true; + } else if (format != "float") { + res_error(res, format_error_response("The format to return the embeddings in. Can be either float or base64", ERROR_TYPE_INVALID_REQUEST)); + return; + } + } + + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true); + for (const auto & tokens : tokenized_prompts) { + // this check is necessary for models that do not add BOS token to the input + if (tokens.empty()) { + res_error(res, format_error_response("Input content cannot be empty", ERROR_TYPE_INVALID_REQUEST)); + return; + } + } + // create and queue the task json responses = json::array(); bool error = false; { std::vector tasks; - std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, /* add_special */ false, true); for (size_t i = 0; i < tokenized_prompts.size(); i++) { - server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); + server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); + task.id = ctx_server.queue_tasks.get_new_id(); task.index = i; task.prompt_tokens = std::move(tokenized_prompts[i]); + + // OAI-compat + task.params.oaicompat = oaicompat; + tasks.push_back(task); } @@ -3676,12 +3852,18 @@ int main(int argc, char ** argv) { } // write JSON response - json root = oaicompat - ? format_embeddings_response_oaicompat(body, responses) - : responses.size() == 1 ? responses[0] : json(responses); + json root = oaicompat ? format_embeddings_response_oaicompat(body, responses, use_base64) : json(responses); res_ok(res, root); }; + const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { + handle_embeddings_impl(req, res, false); + }; + + const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { + handle_embeddings_impl(req, res, true); + }; + const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED)); @@ -3828,8 +4010,13 @@ int main(int argc, char ** argv) { } } else { // using embedded static index.html - svr->Get("/", [](const httplib::Request &, httplib::Response & res) { - res.set_content(reinterpret_cast(index_html), index_html_len, "text/html; charset=utf-8"); + svr->Get("/", [](const httplib::Request & req, httplib::Response & res) { + if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) { + res.set_content("Error: gzip is not supported by this browser", "text/plain"); + } else { + res.set_header("Content-Encoding", "gzip"); + res.set_content(reinterpret_cast(index_html_gz), index_html_gz_len, "text/html; charset=utf-8"); + } return false; }); } @@ -3850,7 +4037,7 @@ int main(int argc, char ** argv) { svr->Post("/infill", handle_infill); svr->Post("/embedding", handle_embeddings); // legacy svr->Post("/embeddings", handle_embeddings); - svr->Post("/v1/embeddings", handle_embeddings); + svr->Post("/v1/embeddings", handle_embeddings_oai); svr->Post("/rerank", handle_rerank); svr->Post("/reranking", handle_rerank); svr->Post("/v1/rerank", handle_rerank); diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index 6573cc17f7b..88549708113 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -31,6 +31,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte }) assert res.status_code == 200 assert "cmpl" in res.body["id"] # make sure the completion id has the expected format + assert res.body["system_fingerprint"].startswith("b") assert res.body["model"] == model if model is not None else server.model_alias assert res.body["usage"]["prompt_tokens"] == n_prompt assert res.body["usage"]["completion_tokens"] == n_predicted @@ -63,6 +64,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte last_cmpl_id = None for data in res: choice = data["choices"][0] + assert data["system_fingerprint"].startswith("b") assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future if last_cmpl_id is None: last_cmpl_id = data["id"] @@ -92,7 +94,7 @@ def test_chat_completion_with_openai_library(): seed=42, temperature=0.8, ) - print(res) + assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b") assert res.choices[0].finish_reason == "length" assert res.choices[0].message.content is not None assert match_regex("(Suddenly)+", res.choices[0].message.content) @@ -163,3 +165,64 @@ def test_chat_completion_with_timings_per_token(): assert "predicted_per_second" in data["timings"] assert "predicted_n" in data["timings"] assert data["timings"]["predicted_n"] <= 10 + + +def test_logprobs(): + global server + server.start() + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") + res = client.chat.completions.create( + model="gpt-3.5-turbo-instruct", + temperature=0.0, + messages=[ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + max_tokens=5, + logprobs=True, + top_logprobs=10, + ) + output_text = res.choices[0].message.content + aggregated_text = '' + assert res.choices[0].logprobs is not None + assert res.choices[0].logprobs.content is not None + for token in res.choices[0].logprobs.content: + aggregated_text += token.token + assert token.logprob <= 0.0 + assert token.bytes is not None + assert len(token.top_logprobs) > 0 + assert aggregated_text == output_text + + +def test_logprobs_stream(): + global server + server.start() + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") + res = client.chat.completions.create( + model="gpt-3.5-turbo-instruct", + temperature=0.0, + messages=[ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + max_tokens=5, + logprobs=True, + top_logprobs=10, + stream=True, + ) + output_text = '' + aggregated_text = '' + for data in res: + choice = data.choices[0] + if choice.finish_reason is None: + if choice.delta.content: + output_text += choice.delta.content + assert choice.logprobs is not None + assert choice.logprobs.content is not None + for token in choice.logprobs.content: + aggregated_text += token.token + assert token.logprob <= 0.0 + assert token.bytes is not None + assert token.top_logprobs is not None + assert len(token.top_logprobs) > 0 + assert aggregated_text == output_text diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py index 7f4f9cd038b..a6b21594466 100644 --- a/examples/server/tests/unit/test_completion.py +++ b/examples/server/tests/unit/test_completion.py @@ -10,22 +10,29 @@ def create_server(): global server server = ServerPreset.tinyllama2() -@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [ - ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False), - ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False), +@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated,return_tokens", [ + ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False, False), + ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False, True), ]) -def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool): +def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool, return_tokens: bool): global server server.start() res = server.make_request("POST", "/completion", data={ "n_predict": n_predict, "prompt": prompt, + "return_tokens": return_tokens, }) assert res.status_code == 200 assert res.body["timings"]["prompt_n"] == n_prompt assert res.body["timings"]["predicted_n"] == n_predicted assert res.body["truncated"] == truncated + assert type(res.body["has_new_line"]) == bool assert match_regex(re_content, res.body["content"]) + if return_tokens: + assert len(res.body["tokens"]) > 0 + assert all(type(tok) == int for tok in res.body["tokens"]) + else: + assert res.body["tokens"] == [] @pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [ @@ -48,12 +55,15 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp assert data["timings"]["predicted_n"] == n_predicted assert data["truncated"] == truncated assert data["stop_type"] == "limit" + assert type(data["has_new_line"]) == bool assert "generation_settings" in data assert server.n_predict is not None assert data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict) assert data["generation_settings"]["seed"] == server.seed assert match_regex(re_content, content) else: + assert len(data["tokens"]) > 0 + assert all(type(tok) == int for tok in data["tokens"]) content += data["content"] @@ -85,7 +95,7 @@ def test_consistent_result_same_seed(n_slots: int): res = server.make_request("POST", "/completion", data={ "prompt": "I believe the meaning of life is", "seed": 42, - "temperature": 1.0, + "temperature": 0.0, "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed }) if last_res is not None: @@ -110,9 +120,10 @@ def test_different_result_different_seed(n_slots: int): assert res.body["content"] != last_res.body["content"] last_res = res - +# TODO figure why it don't work with temperature = 1 +# @pytest.mark.parametrize("temperature", [0.0, 1.0]) @pytest.mark.parametrize("n_batch", [16, 32]) -@pytest.mark.parametrize("temperature", [0.0, 1.0]) +@pytest.mark.parametrize("temperature", [0.0]) def test_consistent_result_different_batch_size(n_batch: int, temperature: float): global server server.n_batch = n_batch @@ -247,6 +258,40 @@ def check_slots_status(): # assert match_regex(re_content, res.body["content"]) +@pytest.mark.parametrize( + "prompt,n_predict,response_fields", + [ + ("I believe the meaning of life is", 8, []), + ("I believe the meaning of life is", 32, ["content", "generation_settings/n_predict", "prompt"]), + ], +) +def test_completion_response_fields( + prompt: str, n_predict: int, response_fields: list[str] +): + global server + server.start() + res = server.make_request( + "POST", + "/completion", + data={ + "n_predict": n_predict, + "prompt": prompt, + "response_fields": response_fields, + }, + ) + assert res.status_code == 200 + assert "content" in res.body + assert len(res.body["content"]) + if len(response_fields): + assert res.body["generation_settings/n_predict"] == n_predict + assert res.body["prompt"] == " " + prompt + assert isinstance(res.body["content"], str) + assert len(res.body) == len(response_fields) + else: + assert len(res.body) + assert "generation_settings" in res.body + + def test_n_probs(): global server server.start() @@ -260,9 +305,68 @@ def test_n_probs(): assert "completion_probabilities" in res.body assert len(res.body["completion_probabilities"]) == 5 for tok in res.body["completion_probabilities"]: - assert "probs" in tok - assert len(tok["probs"]) == 10 - for prob in tok["probs"]: - assert "prob" in prob - assert "tok_str" in prob - assert 0.0 <= prob["prob"] <= 1.0 + assert "id" in tok and tok["id"] > 0 + assert "token" in tok and type(tok["token"]) == str + assert "logprob" in tok and tok["logprob"] <= 0.0 + assert "bytes" in tok and type(tok["bytes"]) == list + assert len(tok["top_logprobs"]) == 10 + for prob in tok["top_logprobs"]: + assert "id" in prob and prob["id"] > 0 + assert "token" in prob and type(prob["token"]) == str + assert "logprob" in prob and prob["logprob"] <= 0.0 + assert "bytes" in prob and type(prob["bytes"]) == list + + +def test_n_probs_stream(): + global server + server.start() + res = server.make_stream_request("POST", "/completion", data={ + "prompt": "I believe the meaning of life is", + "n_probs": 10, + "temperature": 0.0, + "n_predict": 5, + "stream": True, + }) + for data in res: + if data["stop"] == False: + assert "completion_probabilities" in data + assert len(data["completion_probabilities"]) == 1 + for tok in data["completion_probabilities"]: + assert "id" in tok and tok["id"] > 0 + assert "token" in tok and type(tok["token"]) == str + assert "logprob" in tok and tok["logprob"] <= 0.0 + assert "bytes" in tok and type(tok["bytes"]) == list + assert len(tok["top_logprobs"]) == 10 + for prob in tok["top_logprobs"]: + assert "id" in prob and prob["id"] > 0 + assert "token" in prob and type(prob["token"]) == str + assert "logprob" in prob and prob["logprob"] <= 0.0 + assert "bytes" in prob and type(prob["bytes"]) == list + + +def test_n_probs_post_sampling(): + global server + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": "I believe the meaning of life is", + "n_probs": 10, + "temperature": 0.0, + "n_predict": 5, + "post_sampling_probs": True, + }) + assert res.status_code == 200 + assert "completion_probabilities" in res.body + assert len(res.body["completion_probabilities"]) == 5 + for tok in res.body["completion_probabilities"]: + assert "id" in tok and tok["id"] > 0 + assert "token" in tok and type(tok["token"]) == str + assert "prob" in tok and 0.0 < tok["prob"] <= 1.0 + assert "bytes" in tok and type(tok["bytes"]) == list + assert len(tok["top_probs"]) == 10 + for prob in tok["top_probs"]: + assert "id" in prob and prob["id"] > 0 + assert "token" in prob and type(prob["token"]) == str + assert "prob" in prob and 0.0 <= prob["prob"] <= 1.0 + assert "bytes" in prob and type(prob["bytes"]) == list + # because the test model usually output token with either 100% or 0% probability, we need to check all the top_probs + assert any(prob["prob"] == 1.0 for prob in tok["top_probs"]) diff --git a/examples/server/tests/unit/test_embedding.py b/examples/server/tests/unit/test_embedding.py index fc7c20064dd..8b0eb42b092 100644 --- a/examples/server/tests/unit/test_embedding.py +++ b/examples/server/tests/unit/test_embedding.py @@ -1,3 +1,5 @@ +import base64 +import struct import pytest from openai import OpenAI from utils import * @@ -14,8 +16,9 @@ def create_server(): def test_embedding_single(): global server + server.pooling = 'last' server.start() - res = server.make_request("POST", "/embeddings", data={ + res = server.make_request("POST", "/v1/embeddings", data={ "input": "I believe the meaning of life is", }) assert res.status_code == 200 @@ -29,8 +32,9 @@ def test_embedding_single(): def test_embedding_multiple(): global server + server.pooling = 'last' server.start() - res = server.make_request("POST", "/embeddings", data={ + res = server.make_request("POST", "/v1/embeddings", data={ "input": [ "I believe the meaning of life is", "Write a joke about AI from a very long prompt which will not be truncated", @@ -45,10 +49,72 @@ def test_embedding_multiple(): assert len(d['embedding']) > 1 +@pytest.mark.parametrize( + "input,is_multi_prompt", + [ + # do not crash on empty input + ("", False), + # single prompt + ("string", False), + ([12, 34, 56], False), + ([12, 34, "string", 56, 78], False), + # multiple prompts + (["string1", "string2"], True), + (["string1", [12, 34, 56]], True), + ([[12, 34, 56], [12, 34, 56]], True), + ([[12, 34, 56], [12, "string", 34, 56]], True), + ] +) +def test_embedding_mixed_input(input, is_multi_prompt: bool): + global server + server.start() + res = server.make_request("POST", "/v1/embeddings", data={"input": input}) + assert res.status_code == 200 + data = res.body['data'] + if is_multi_prompt: + assert len(data) == len(input) + for d in data: + assert 'embedding' in d + assert len(d['embedding']) > 1 + else: + assert 'embedding' in data[0] + assert len(data[0]['embedding']) > 1 + + +def test_embedding_pooling_none(): + global server + server.pooling = 'none' + server.start() + res = server.make_request("POST", "/embeddings", data={ + "input": "hello hello hello", + }) + assert res.status_code == 200 + assert 'embedding' in res.body[0] + assert len(res.body[0]['embedding']) == 5 # 3 text tokens + 2 special + + # make sure embedding vector is not normalized + for x in res.body[0]['embedding']: + assert abs(sum([x ** 2 for x in x]) - 1) > EPSILON + + +def test_embedding_pooling_none_oai(): + global server + server.pooling = 'none' + server.start() + res = server.make_request("POST", "/v1/embeddings", data={ + "input": "hello hello hello", + }) + + # /v1/embeddings does not support pooling type 'none' + assert res.status_code == 400 + assert "error" in res.body + + def test_embedding_openai_library_single(): global server + server.pooling = 'last' server.start() - client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") res = client.embeddings.create(model="text-embedding-3-small", input="I believe the meaning of life is") assert len(res.data) == 1 assert len(res.data[0].embedding) > 1 @@ -56,8 +122,9 @@ def test_embedding_openai_library_single(): def test_embedding_openai_library_multiple(): global server + server.pooling = 'last' server.start() - client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") res = client.embeddings.create(model="text-embedding-3-small", input=[ "I believe the meaning of life is", "Write a joke about AI from a very long prompt which will not be truncated", @@ -71,8 +138,9 @@ def test_embedding_openai_library_multiple(): def test_embedding_error_prompt_too_long(): global server + server.pooling = 'last' server.start() - res = server.make_request("POST", "/embeddings", data={ + res = server.make_request("POST", "/v1/embeddings", data={ "input": "This is a test " * 512, }) assert res.status_code != 200 @@ -80,8 +148,9 @@ def test_embedding_error_prompt_too_long(): def test_same_prompt_give_same_result(): + server.pooling = 'last' server.start() - res = server.make_request("POST", "/embeddings", data={ + res = server.make_request("POST", "/v1/embeddings", data={ "input": [ "I believe the meaning of life is", "I believe the meaning of life is", @@ -97,3 +166,72 @@ def test_same_prompt_give_same_result(): vi = res.body['data'][i]['embedding'] for x, y in zip(v0, vi): assert abs(x - y) < EPSILON + + +@pytest.mark.parametrize( + "content,n_tokens", + [ + ("I believe the meaning of life is", 9), + ("This is a test", 6), + ] +) +def test_embedding_usage_single(content, n_tokens): + global server + server.start() + res = server.make_request("POST", "/v1/embeddings", data={"input": content}) + assert res.status_code == 200 + assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens'] + assert res.body['usage']['prompt_tokens'] == n_tokens + + +def test_embedding_usage_multiple(): + global server + server.start() + res = server.make_request("POST", "/v1/embeddings", data={ + "input": [ + "I believe the meaning of life is", + "I believe the meaning of life is", + ], + }) + assert res.status_code == 200 + assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens'] + assert res.body['usage']['prompt_tokens'] == 2 * 9 + + +def test_embedding_openai_library_base64(): + server.start() + test_input = "Test base64 embedding output" + + # get embedding in default format + res = server.make_request("POST", "/v1/embeddings", data={ + "input": test_input + }) + assert res.status_code == 200 + vec0 = res.body["data"][0]["embedding"] + + # get embedding in base64 format + res = server.make_request("POST", "/v1/embeddings", data={ + "input": test_input, + "encoding_format": "base64" + }) + + assert res.status_code == 200 + assert "data" in res.body + assert len(res.body["data"]) == 1 + + embedding_data = res.body["data"][0] + assert "embedding" in embedding_data + assert isinstance(embedding_data["embedding"], str) + + # Verify embedding is valid base64 + decoded = base64.b64decode(embedding_data["embedding"]) + # Verify decoded data can be converted back to float array + float_count = len(decoded) // 4 # 4 bytes per float + floats = struct.unpack(f'{float_count}f', decoded) + assert len(floats) > 0 + assert all(isinstance(x, float) for x in floats) + assert len(floats) == len(vec0) + + # make sure the decoded data is the same as the original + for x, y in zip(floats, vec0): + assert abs(x - y) < EPSILON diff --git a/examples/server/tests/unit/test_rerank.py b/examples/server/tests/unit/test_rerank.py index 189bc4c9623..7203d794357 100644 --- a/examples/server/tests/unit/test_rerank.py +++ b/examples/server/tests/unit/test_rerank.py @@ -53,3 +53,26 @@ def test_invalid_rerank_req(documents): }) assert res.status_code == 400 assert "error" in res.body + + +@pytest.mark.parametrize( + "query,doc1,doc2,n_tokens", + [ + ("Machine learning is", "A machine", "Learning is", 19), + ("Which city?", "Machine learning is ", "Paris, capitale de la", 26), + ] +) +def test_rerank_usage(query, doc1, doc2, n_tokens): + global server + server.start() + + res = server.make_request("POST", "/rerank", data={ + "query": query, + "documents": [ + doc1, + doc2, + ] + }) + assert res.status_code == 200 + assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens'] + assert res.body['usage']['prompt_tokens'] == n_tokens diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index d988ccf5e30..277125e88b5 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -65,6 +65,7 @@ class ServerProcess: server_reranking: bool | None = False server_metrics: bool | None = False server_slots: bool | None = False + pooling: str | None = None draft: int | None = None api_key: str | None = None response_format: str | None = None @@ -132,6 +133,8 @@ def start(self, timeout_seconds: int = 10) -> None: server_args.append("--metrics") if self.server_slots: server_args.append("--slots") + if self.pooling: + server_args.extend(["--pooling", self.pooling]) if self.model_alias: server_args.extend(["--alias", self.model_alias]) if self.n_ctx: diff --git a/examples/server/themes/buttons-top/index.html b/examples/server/themes/buttons-top/index.html index 2797c37c964..3fb88fcc88d 100644 --- a/examples/server/themes/buttons-top/index.html +++ b/examples/server/themes/buttons-top/index.html @@ -222,7 +222,6 @@ temperature: 0.7, repeat_last_n: 256, // 0 = disable penalty, -1 = context size repeat_penalty: 1.18, // 1.0 = disabled - penalize_nl: false, top_k: 40, // <= 0 to use vocab size top_p: 0.95, // 1.0 = disabled min_p: 0.05, // 0 = disabled @@ -779,7 +778,6 @@ ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })} ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })} ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })} - ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })} ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })} ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })} ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })} diff --git a/examples/server/themes/wild/index.html b/examples/server/themes/wild/index.html index dbe23c40241..73f36d4b29f 100644 --- a/examples/server/themes/wild/index.html +++ b/examples/server/themes/wild/index.html @@ -225,7 +225,6 @@ temperature: 0.7, repeat_last_n: 256, // 0 = disable penalty, -1 = context size repeat_penalty: 1.18, // 1.0 = disabled - penalize_nl: false, top_k: 40, // <= 0 to use vocab size top_p: 0.95, // 1.0 = disabled min_p: 0.05, // 0 = disabled @@ -782,7 +781,6 @@ ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })} ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })} ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })} - ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })} ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })} ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })} ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })} diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 8f545aea52d..334f2f19207 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -3,6 +3,7 @@ #include "common.h" #include "log.h" #include "llama.h" +#include "common/base64.hpp" #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error @@ -22,7 +23,7 @@ #include #include -#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" +#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo" using json = nlohmann::ordered_json; @@ -56,6 +57,8 @@ static T json_value(const json & body, const std::string & key, const T & defaul } } +const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT); + // // tokenizer and input processing utils // @@ -88,6 +91,28 @@ static bool json_is_array_of_mixed_numbers_strings(const json & data) { return false; } +// get value by path(key1 / key2) +static json json_get_nested_values(const std::vector & paths, const json & js) { + json result = json::object(); + + for (const std::string & path : paths) { + json current = js; + const auto keys = string_split(path, /*separator*/ '/'); + bool valid_path = true; + for (const std::string & k : keys) { + if (valid_path && current.is_object() && current.contains(k)) { + current = current[k]; + } else { + valid_path = false; + } + } + if (valid_path) { + result[path] = current; + } + } + return result; +} + /** * this handles 2 cases: * - only string, example: "string" @@ -138,6 +163,7 @@ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_ * and multiple prompts (multi-tasks): * - "prompt": ["string1", "string2"] * - "prompt": ["string1", [12, 34, 56]] + * - "prompt": [[12, 34, 56], [78, 90, 12]] * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]] */ static std::vector tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) { @@ -170,6 +196,36 @@ static std::vector tokenize_input_prompts(llama_context * ctx, con return result; } +// return the last index of character that can form a valid string +// if the last character is potentially cut in half, return the index before the cut +// if validate_utf8(text) == text.size(), then the whole text is valid utf8 +static size_t validate_utf8(const std::string& text) { + size_t len = text.size(); + if (len == 0) return 0; + + // Check the last few bytes to see if a multi-byte character is cut off + for (size_t i = 1; i <= 4 && i <= len; ++i) { + unsigned char c = text[len - i]; + // Check for start of a multi-byte sequence from the end + if ((c & 0xE0) == 0xC0) { + // 2-byte character start: 110xxxxx + // Needs at least 2 bytes + if (i < 2) return len - i; + } else if ((c & 0xF0) == 0xE0) { + // 3-byte character start: 1110xxxx + // Needs at least 3 bytes + if (i < 3) return len - i; + } else if ((c & 0xF8) == 0xF0) { + // 4-byte character start: 11110xxx + // Needs at least 4 bytes + if (i < 4) return len - i; + } + } + + // If no cut-off multi-byte character is found, return full length + return len; +} + // // template utils // @@ -333,7 +389,7 @@ static std::string llama_get_chat_template(const struct llama_model * model) { if (res < 2) { return ""; } else { - std::vector model_template(res, 0); + std::vector model_template(res + 1, 0); llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); return std::string(model_template.data(), model_template.size() - 1); } @@ -558,23 +614,41 @@ static json oaicompat_completion_params_parse( return llama_params; } -static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { +static json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false) { json data = json::array(); + int32_t n_tokens = 0; int i = 0; for (const auto & elem : embeddings) { - data.push_back(json{ - {"embedding", json_value(elem, "embedding", json::array())}, - {"index", i++}, - {"object", "embedding"} - }); + json embedding_obj; + + if (use_base64) { + const auto& vec = json_value(elem, "embedding", json::array()).get>(); + const char* data_ptr = reinterpret_cast(vec.data()); + size_t data_size = vec.size() * sizeof(float); + embedding_obj = { + {"embedding", base64::encode(data_ptr, data_size)}, + {"index", i++}, + {"object", "embedding"}, + {"encoding_format", "base64"} + }; + } else { + embedding_obj = { + {"embedding", json_value(elem, "embedding", json::array())}, + {"index", i++}, + {"object", "embedding"} + }; + } + data.push_back(embedding_obj); + + n_tokens += json_value(elem, "tokens_evaluated", 0); } json res = json { {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, {"object", "list"}, - {"usage", json { // TODO: fill - {"prompt_tokens", 0}, - {"total_tokens", 0} + {"usage", json { + {"prompt_tokens", n_tokens}, + {"total_tokens", n_tokens} }}, {"data", data} }; @@ -584,20 +658,23 @@ static json format_embeddings_response_oaicompat(const json & request, const jso static json format_response_rerank(const json & request, const json & ranks) { json data = json::array(); + int32_t n_tokens = 0; int i = 0; for (const auto & rank : ranks) { data.push_back(json{ {"index", i++}, {"relevance_score", json_value(rank, "score", 0.0)}, }); + + n_tokens += json_value(rank, "tokens_evaluated", 0); } json res = json { {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, {"object", "list"}, - {"usage", json { // TODO: fill - {"prompt_tokens", 0}, - {"total_tokens", 0} + {"usage", json { + {"prompt_tokens", n_tokens}, + {"total_tokens", n_tokens} }}, {"results", data} }; @@ -664,3 +741,33 @@ static json format_logit_bias(const std::vector & logit_bias) static std::string safe_json_to_str(json data) { return data.dump(-1, ' ', false, json::error_handler_t::replace); } + +static std::vector get_token_probabilities(llama_context * ctx, int idx) { + std::vector cur; + const auto * logits = llama_get_logits_ith(ctx, idx); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + + cur.resize(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; + } + + // sort tokens by logits + std::sort(cur.begin(), cur.end(), [](const llama_token_data & a, const llama_token_data & b) { + return a.logit > b.logit; + }); + + // apply softmax + float max_l = cur[0].logit; + float cum_sum = 0.0f; + for (size_t i = 0; i < cur.size(); ++i) { + float p = expf(cur[i].logit - max_l); + cur[i].p = p; + cum_sum += p; + } + for (size_t i = 0; i < cur.size(); ++i) { + cur[i].p /= cum_sum; + } + + return cur; +} diff --git a/examples/server/webui/index.html b/examples/server/webui/index.html index c7e18b45e1a..dcdd410797d 100644 --- a/examples/server/webui/index.html +++ b/examples/server/webui/index.html @@ -15,7 +15,7 @@
-
+

Conversations

@@ -120,51 +120,25 @@

Conversations

{{ messages.length === 0 ? 'Send a message to start' : '' }}
-
-
- - - - -
-
- - -
- - - - - -
+
-
-
- - -
+
+
@@ -227,6 +201,14 @@

Settings

Advanced config
+
+ + +
+
+ + Show tokens per second +
+ + + + +